mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-26 06:01:49 +00:00
Add GSM8k agent env using proper HermesAgentBaseEnv (not ICL)
- environments/gsm8k_agent_env.py: Math reasoning with Python REPL tool - Subclasses HermesAgentBaseEnv (proper tools= parameter, not ICL) - Uses ATROPOS_SERVER_* env vars from .env - Hermes tool call parser, configurable per model - Math verification via math_verify with string fallback - Tested: process mode works, both trajectories scored 1.0 - Updated memory bank with consolidation plan: - environments/ is the canonical env system (proper tool calling) - atropos/backends/ kept as sandbox infrastructure - atropos/agent/ and atropos/envs/agent_env.py marked for removal
This commit is contained in:
parent
9dc27880cd
commit
975c849308
4 changed files with 555 additions and 155 deletions
350
environments/gsm8k_agent_env.py
Normal file
350
environments/gsm8k_agent_env.py
Normal file
|
|
@ -0,0 +1,350 @@
|
||||||
|
"""
|
||||||
|
GSM8kAgentEnv -- Math Reasoning with Tool Use (Python REPL)
|
||||||
|
|
||||||
|
An agentic RL environment where models solve GSM8k math problems using
|
||||||
|
a Python interpreter tool. Uses proper OpenAI-spec tool calling via
|
||||||
|
HermesAgentBaseEnv (not ICL).
|
||||||
|
|
||||||
|
The model:
|
||||||
|
1. Receives a math problem
|
||||||
|
2. Can call the `terminal` tool to run Python code (`python3 -c "..."`)
|
||||||
|
3. Provides a final answer in \\boxed{} format
|
||||||
|
4. Gets reward: 1.0 if correct, 0.0 if wrong
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
# Phase 1 (OpenRouter, no training):
|
||||||
|
python environments/gsm8k_agent_env.py process \\
|
||||||
|
--env.data_path_to_save_groups gsm8k_agent_output.jsonl
|
||||||
|
|
||||||
|
# Phase 2 (VLLM + Tinker training):
|
||||||
|
run-api
|
||||||
|
python launch_training.py --config configs/gsm8k_agent.yaml
|
||||||
|
python environments/gsm8k_agent_env.py serve
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
# Make the repository root importable. This file lives one directory below
# the root (environments/), and the absolute `environments.*` imports further
# down only resolve when the root is on sys.path -- which is not the case
# when the file is executed directly as a script.
_repo_root = Path(__file__).resolve().parent.parent
if str(_repo_root) not in sys.path:
    # Prepend so this checkout takes precedence over other sys.path entries.
    sys.path.insert(0, str(_repo_root))
|
||||||
|
|
||||||
|
from atroposlib.envs.base import ScoredDataGroup
|
||||||
|
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
||||||
|
from atroposlib.type_definitions import Item
|
||||||
|
|
||||||
|
from environments.agent_loop import AgentResult
|
||||||
|
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
|
||||||
|
from environments.tool_context import ToolContext
|
||||||
|
|
||||||
|
# Module-level logger named after this module; used below to report
# failures during evaluation.
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Math verification helpers
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def _answers_match(model_answer: str, gold_answer: str) -> bool:
    """Compare two short answer strings, numerically when both are numbers.

    Whitespace, thousands separators and (optionally escaped) dollar signs
    are ignored. If both cleaned strings parse as decimals they are compared
    by value, so ``18.0`` matches ``18``; otherwise the cleaned strings must
    be identical.
    """
    from decimal import Decimal, InvalidOperation

    def _clean(raw: str) -> str:
        return (
            raw.strip()
            .replace(",", "")
            .replace("\\$", "")
            .replace("$", "")
            .strip()
        )

    model_clean = _clean(model_answer)
    gold_clean = _clean(gold_answer)

    # Exact textual match first: keeps every case that plain string
    # comparison accepts, including non-numeric answers.
    if model_clean == gold_clean:
        return True

    try:
        return Decimal(model_clean) == Decimal(gold_clean)
    except InvalidOperation:
        # At least one side is not a plain number.
        return False


def _verify_math_answer(model_response: str, gold_answer: str) -> bool:
    """
    Verify if the model's response contains the correct answer.

    Uses math_verify for robust LaTeX comparison and falls back to matching
    the last ``\\boxed{...}`` in the response when math_verify (or
    latex2sympy2_extended) is not installed.

    Args:
        model_response: Raw assistant text. Only the part after the final
            ``</think>`` tag (if any) is considered, for both code paths.
        gold_answer: Reference answer as a plain string (e.g. ``"18"``).

    Returns:
        True if the extracted answer matches the gold answer, else False.
        An empty or missing response is always False.
    """
    # Strip <think> blocks if present. Done before the try so the fallback
    # path judges exactly the same text as the math_verify path.
    answer_text = model_response or ""
    if "</think>" in answer_text:
        answer_text = answer_text.split("</think>")[-1]

    try:
        from latex2sympy2_extended import NormalizationConfig
        from math_verify import LatexExtractionConfig, parse, verify

        gold_parsed = parse(
            f"\\boxed{{{gold_answer}}}",
            extraction_mode="first_match",
            extraction_config=[LatexExtractionConfig()],
        )

        answer_parsed = parse(
            answer_text,
            extraction_config=[
                LatexExtractionConfig(
                    normalization_config=NormalizationConfig(
                        nits=False,
                        malformed_operators=False,
                        basic_latex=True,
                        boxed="all",
                        units=True,
                    ),
                    boxed_match_priority=0,
                    try_extract_without_anchor=False,
                )
            ],
            extraction_mode="first_match",
        )

        return bool(verify(answer_parsed, gold_parsed))

    except ImportError:
        # Fallback: simple matching on the last \boxed{answer} occurrence.
        import re

        matches = re.findall(r'\\boxed\{([^}]+)\}', answer_text)
        if not matches:
            return False
        return _answers_match(matches[-1], gold_answer)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Environment Config
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class GSM8kAgentEnvConfig(HermesAgentEnvConfig):
    """Config with defaults for GSM8k agent environment.

    Adds no fields of its own; it gives the GSM8k environment a dedicated
    config type (see ``GSM8kAgentEnv.env_config_cls``). The actual default
    values are supplied in ``GSM8kAgentEnv.config_init``.
    """
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Environment
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class GSM8kAgentEnv(HermesAgentBaseEnv):
    """
    GSM8k math environment with Python REPL tool calling.

    Models solve grade-school math problems by reasoning step by step
    and using Python (via the terminal tool) for calculations.

    Exercises the full agentic RL training loop:
    - Model receives math problem
    - Makes tool calls to compute (python3 -c "...")
    - Provides final answer in \\boxed{}
    - Reward: binary (1.0 correct, 0.0 wrong)
    """

    name = "gsm8k-agent"
    env_config_cls = GSM8kAgentEnvConfig

    @staticmethod
    def _extract_gold_answer(raw_answer: str) -> str:
        """Return the text after the last ``#`` of a dataset answer.

        The result is stripped of surrounding whitespace and of thousands
        separators (``,``).
        """
        return raw_answer.split("#")[-1].strip().replace(",", "")

    @classmethod
    def config_init(cls) -> Tuple[GSM8kAgentEnvConfig, List[APIServerConfig]]:
        """
        Default config using terminal tool.

        Reads from environment variables (set in .env); for each setting the
        first non-empty variable wins:
            Base URL:  ATROPOS_SERVER_BASE_URL, OPENAI_BASE_URL, LLM_BASE_URL
                       (default: https://openrouter.ai/api/v1)
            Model:     ATROPOS_SERVER_MODEL, LLM_MODEL
            Tokenizer: ATROPOS_TOKENIZER_NAME, ATROPOS_TOKENIZER
            API key:   ATROPOS_SERVER_API_KEY, NOUS_API_KEY,
                       OPENROUTER_API_KEY, OPENAI_API_KEY

        Returns:
            The environment config and a single-element list of server
            configs.
        """
        # Resolve inference server settings from env
        base_url = (
            os.getenv("ATROPOS_SERVER_BASE_URL")
            or os.getenv("OPENAI_BASE_URL")
            or os.getenv("LLM_BASE_URL")
            or "https://openrouter.ai/api/v1"
        )
        # Append "/v1" when the configured URL does not already end with it.
        if not base_url.rstrip("/").endswith("/v1"):
            base_url = base_url.rstrip("/") + "/v1"

        model = (
            os.getenv("ATROPOS_SERVER_MODEL")
            or os.getenv("LLM_MODEL")
            or "Hermes-4.3-36B"
        )

        api_key = (
            os.getenv("ATROPOS_SERVER_API_KEY")
            or os.getenv("NOUS_API_KEY")
            or os.getenv("OPENROUTER_API_KEY")
            or os.getenv("OPENAI_API_KEY")
            or ""
        )

        tokenizer = (
            os.getenv("ATROPOS_TOKENIZER_NAME")
            or os.getenv("ATROPOS_TOKENIZER")
            or "NousResearch/Hermes-4.3-36B"
        )

        env_config = GSM8kAgentEnvConfig(
            # Terminal tool only -- model uses `python3 -c "..."` for math
            enabled_toolsets=["terminal"],
            disabled_toolsets=None,
            distribution=None,
            # Agent settings
            max_agent_turns=5,  # Math problems don't need many turns
            max_token_length=2048,  # Room for reasoning + code
            agent_temperature=1.0,
            system_prompt=(
                "You are a helpful math assistant. You have access to a terminal "
                "where you can run Python code to help solve problems.\n\n"
                "When you need to calculate something, use the terminal tool with "
                "a command like: python3 -c \"print(2 + 2)\"\n\n"
                "When you have the final answer, write it inside \\boxed{} like: \\boxed{42}\n\n"
                "Work step by step. Use Python to verify your reasoning."
            ),
            # Terminal backend (local for testing, modal for production)
            terminal_backend=os.getenv("TERMINAL_ENV", "local"),
            # Parser -- hermes format for Hermes models
            tool_call_parser="hermes",
            # Atropos settings
            group_size=4,
            tokenizer_name=tokenizer,
            steps_per_eval=5,
            total_steps=10,
            use_wandb=bool(os.getenv("WANDB_API_KEY")),
            wandb_name="gsm8k-agent",
            ensure_scores_are_not_same=False,
            # No external dataset (we load GSM8k ourselves)
            dataset_name=None,
        )

        server_configs = [
            APIServerConfig(
                base_url=base_url,
                model_name=model,
                server_type="openai",
                api_key=api_key,
                health_check=False,
            )
        ]

        return env_config, server_configs

    async def setup(self):
        """Load GSM8k dataset and reset the iteration/metric state.

        The train split is kept as a shuffled dataset and parsed lazily in
        ``get_next_item``; the test split is converted up front into a list
        of ``{"question", "gold_answer"}`` dicts for ``evaluate``.
        """
        from datasets import load_dataset

        self.train = load_dataset("gsm8k", "main", split="train").shuffle(seed=42)
        test_data = load_dataset("gsm8k", "main", split="test").shuffle(seed=42)
        self.test = [
            {
                "question": item["question"],
                "gold_answer": self._extract_gold_answer(item["answer"]),
            }
            for item in test_data
        ]
        self.iter = 0
        # Per-rollout metrics, drained by wandb_log().
        self.reward_buffer: List[float] = []
        self.tool_use_buffer: List[int] = []
        print(f"[GSM8kAgentEnv] Loaded {len(self.train)} train, {len(self.test)} test examples")

    async def get_next_item(self) -> Dict[str, str]:
        """Cycle through training problems, wrapping around at the end."""
        item = self.train[self.iter % len(self.train)]
        self.iter += 1
        return {
            "question": item["question"],
            "gold_answer": self._extract_gold_answer(item["answer"]),
        }

    def format_prompt(self, item: Dict[str, str]) -> str:
        """Format the math problem as a user message (the bare question)."""
        return item["question"]

    async def compute_reward(
        self, item: Dict[str, str], result: AgentResult, ctx: ToolContext
    ) -> float:
        """
        Score: verify the model's \\boxed{} answer against the gold answer.

        The agent has full access to terminal via ctx, but for GSM8k we just
        check the final answer from the conversation.

        Side effects: appends the reward to ``self.reward_buffer`` and the
        trajectory's tool-call count to ``self.tool_use_buffer``.

        Returns:
            1.0 if the last non-empty assistant message contains the correct
            answer, otherwise 0.0.
        """
        # Get the last assistant message content
        final_text = ""
        for msg in reversed(result.messages):
            if msg.get("role") == "assistant" and msg.get("content"):
                final_text = msg["content"]
                break

        correct = _verify_math_answer(final_text, item["gold_answer"])
        reward = 1.0 if correct else 0.0

        self.reward_buffer.append(reward)

        # Count tool calls in this trajectory. `tool_calls` may be present
        # but None on assistant messages that made no calls, so `or []` is
        # needed -- a .get() default only covers the missing-key case.
        tool_call_count = sum(
            len(msg.get("tool_calls") or [])
            for msg in result.messages
            if msg.get("role") == "assistant"
        )
        self.tool_use_buffer.append(tool_call_count)

        return reward

    async def evaluate(self, *args, **kwargs):
        """Evaluate on a subset of the test set (greedy, no tools for speed).

        Each problem is sent as a single chat completion at temperature 0.
        A request that raises is logged and counted as incorrect so that one
        failure does not abort the whole evaluation.
        """
        start_time = time.time()
        correct = 0
        total = 0
        samples = []

        eval_subset = self.test[:30]  # Small subset for quick eval

        for item in eval_subset:
            # Every attempted item counts towards the denominator, whether
            # the request succeeds or fails.
            total += 1
            try:
                completion = await self.server.chat_completion(
                    messages=[
                        {"role": "system", "content": self.config.system_prompt or ""},
                        {"role": "user", "content": item["question"]},
                    ],
                    n=1,
                    max_tokens=self.config.max_token_length,
                    temperature=0.0,
                    split="eval",
                )

                response = completion.choices[0].message.content or ""
                is_correct = _verify_math_answer(response, item["gold_answer"])

                if is_correct:
                    correct += 1

                samples.append({
                    "question": item["question"],
                    "gold_answer": item["gold_answer"],
                    "response": response[:500],
                    "correct": is_correct,
                })

            except Exception as e:
                logger.error("Eval failed: %s", e)

        percent_correct = correct / total if total > 0 else 0.0
        end_time = time.time()

        await self.evaluate_log(
            metrics={"eval/percent_correct": percent_correct, "eval/total": total},
            samples=samples,
            start_time=start_time,
            end_time=end_time,
        )

    async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
        """Log training metrics.

        Adds accuracy and tool-usage statistics computed from the buffers
        filled by ``compute_reward``, clears those buffers, then delegates
        to the base class.
        """
        if wandb_metrics is None:
            wandb_metrics = {}

        if self.reward_buffer:
            wandb_metrics["train/percent_correct"] = sum(self.reward_buffer) / len(self.reward_buffer)
            wandb_metrics["train/total_rollouts"] = len(self.reward_buffer)
            self.reward_buffer = []

        if self.tool_use_buffer:
            wandb_metrics["train/avg_tool_calls"] = sum(self.tool_use_buffer) / len(self.tool_use_buffer)
            # Fraction of rollouts that made at least one tool call.
            wandb_metrics["train/tool_use_rate"] = sum(1 for t in self.tool_use_buffer if t > 0) / len(self.tool_use_buffer)
            self.tool_use_buffer = []

        await super().wandb_log(wandb_metrics)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: hand control to the environment's command-line
# interface (the module docstring shows the `process` and `serve` commands).
if __name__ == "__main__":
    GSM8kAgentEnv.cli()
|
||||||
|
|
@ -1,61 +1,99 @@
|
||||||
# Active Context
|
# Active Context
|
||||||
|
|
||||||
## Current Focus
|
## Current Focus
|
||||||
Tinker RL training integration - pipeline fully wired up, waiting on Tinker billing to test.
|
Consolidating the two Atropos environment systems and fixing tool calling to use proper OpenAI-spec approach instead of ICL.
|
||||||
|
|
||||||
## Recently Completed (Feb 9, 2026)
|
## PR Feedback from Lead Dev (Feb 10, 2026)
|
||||||
|
|
||||||
### Tinker RL Training Integration
|
The PR was rejected because our approach has three fundamental issues:
|
||||||
Created a complete agent training pipeline using Tinker (Thinking Machines) + Atropos:
|
|
||||||
|
|
||||||
**New Files Created:**
|
### Issue 1: ManagedServer doesn't pass `tools=` to `apply_chat_template()`
|
||||||
1. `tinker-atropos/tinker_atropos/environments/gsm8k_agent.py` - Agent GSM8k environment with:
|
- When using Phase 2 (VLLM/SGLang for RL training), `ManagedServer` needs to pass tools to `tokenizer.apply_chat_template(tools=...)`
|
||||||
- Python REPL tool calling (Hermes-style `<tool_call>` format)
|
- This makes the system prompt include tool definitions the way models were trained to expect
|
||||||
- Multi-step agent loop within `collect_trajectories()`
|
- **Fix**: Atropos PR #366 adds `tool_call_parser` support to ManagedServer (branch: `tool_call_support`)
|
||||||
- Math answer verification via `math_verify`
|
|
||||||
- Subprocess-based Python execution
|
|
||||||
- WandB metrics (percent_correct, tool_use_rate)
|
|
||||||
2. `tinker-atropos/configs/gsm8k_agent.yaml` - Config for Qwen3-4B-Instruct training
|
|
||||||
|
|
||||||
**Dependencies Updated:**
|
### Issue 2: ICL prompt vs proper tool calling
|
||||||
- `pyproject.toml` `[atropos]` extra now includes: tinker SDK, torch, wandb, math-verify
|
- Our code embeds tools as XML in the system prompt (`<tools>...</tools>`)
|
||||||
- Installed: tinker 0.12.0, tinker-atropos 0.1.0, torch (CPU)
|
- Proper approach: pass `tools=` parameter in `chat_completion()` calls and let the tokenizer's chat template handle formatting
|
||||||
|
- All Hermes datasets train on the proper format, not ICL
|
||||||
|
|
||||||
**README Updated:**
|
### Issue 3: Only Hermes `<tool_call>` parser, no multi-model support
|
||||||
- Added comprehensive "RL Training with Tinker" section with architecture diagram, quick start, config docs
|
- Our code only handles Hermes-style `<tool_call>` XML parsing
|
||||||
- Added TINKER_API_KEY and WANDB_API_KEY to optional keys table
|
- Proper approach: parser registry supporting 11+ model families (hermes, qwen, deepseek, llama, mistral, etc.)
|
||||||
|
|
||||||
**Verified Working:**
|
## Architecture: What Exists Now (Two Parallel Systems)
|
||||||
- Tinker SDK connection ✅
|
|
||||||
- All imports (tinker, tinker_atropos, trainer, environment) ✅
|
|
||||||
- Python REPL execution + tool call parsing ✅
|
|
||||||
- Math verification ✅
|
|
||||||
- Atropos run-api (port 8000) ✅
|
|
||||||
- Tinker trainer starts, loads config, creates inference server (port 8001) ✅
|
|
||||||
|
|
||||||
**Blocked:** Tinker billing (402 error) - user's payment didn't process (possibly regional card issue)
|
|
||||||
|
|
||||||
### Main Branch Merge (Feb 9, 2026)
|
|
||||||
Merged `origin/main` into `atropos-integrations` - 22,560 lines, 79 files, 5 conflicts resolved.
|
|
||||||
|
|
||||||
### Modal Backend (Feb 8, 2026)
|
|
||||||
Merged modal-integration branch, working with Modal Sandboxes.
|
|
||||||
|
|
||||||
### Singularity/Apptainer (Feb 6, 2026)
|
|
||||||
Completed and tested.
|
|
||||||
|
|
||||||
## Architecture: Training Pipeline
|
|
||||||
|
|
||||||
|
### `environments/` (Teknium's proper approach) ✅ CORRECT
|
||||||
```
|
```
|
||||||
Terminal 1: run-api (port 8000) - Atropos Rollout API
|
environments/
|
||||||
Terminal 2: launch_training.py (port 8001) - Tinker Trainer + FastAPI inference
|
├── agent_loop.py ← Uses tools= in chat_completion() (OpenAI spec)
|
||||||
Terminal 3: gsm8k_agent.py serve - Environment (generates trajectories)
|
├── hermes_base_env.py ← Phase 1 (OpenAI) + Phase 2 (ManagedServer + parser)
|
||||||
|
├── tool_context.py ← ToolContext for reward functions
|
||||||
|
├── tool_call_parsers/ ← 11 model parsers (hermes, qwen, deepseek, llama, etc.)
|
||||||
|
│ ├── __init__.py ← Registry with get_parser(), register_parser()
|
||||||
|
│ ├── hermes_parser.py
|
||||||
|
│ ├── qwen_parser.py
|
||||||
|
│ ├── deepseek_v3_parser.py
|
||||||
|
│ ├── llama_parser.py
|
||||||
|
│ ├── mistral_parser.py
|
||||||
|
│ └── ... (11 total)
|
||||||
|
├── terminal_test_env.py ← Working example: file creation tasks
|
||||||
|
├── hermes_swe_env.py ← SWE environment
|
||||||
|
└── patches.py ← Async-safe monkey patches
|
||||||
```
|
```
|
||||||
|
|
||||||
The agent env gets math problems → model calls Python REPL tool → scores answer → sends to Atropos → Tinker does LoRA training → updates sampling weights → repeat.
|
**How it works correctly:**
|
||||||
|
1. `HermesAgentLoop.run()` passes `tools=self.tool_schemas` to `chat_completion()`
|
||||||
|
2. ManagedServer passes tools to `tokenizer.apply_chat_template(tools=...)`
|
||||||
|
3. Parser registry reconstructs `tool_calls` from raw model output
|
||||||
|
4. Tool execution uses hermes-agent's `handle_function_call()` from `model_tools.py`
|
||||||
|
|
||||||
## Next Steps
|
### `atropos/` (Our sandbox-optimized code) - PARTIALLY REDUNDANT
|
||||||
- [ ] Resolve Tinker billing to test full training loop
|
```
|
||||||
- [ ] Run GSM8k agent training for ~20 steps (proof of concept)
|
atropos/
|
||||||
- [ ] Monitor WandB for reward improvement
|
├── agent/atropos_agent.py ← ICL-based agent (REDUNDANT with agent_loop.py)
|
||||||
- [ ] Graduate to more complex agent envs (SWE tasks with Modal backend)
|
├── envs/agent_env.py ← Environment with sandbox backends (PARTIALLY REDUNDANT)
|
||||||
|
├── envs/swe_smith_oracle_env.py ← SWE env using sandbox (KEEP - port to new base)
|
||||||
|
├── backends/ ← Sandbox backends (KEEP - valuable infrastructure)
|
||||||
|
│ ├── modal_backend.py ← Modal sandbox pool
|
||||||
|
│ ├── nomad_backend.py ← Nomad/Docker/Singularity
|
||||||
|
│ └── base.py ← ToolBackend protocol
|
||||||
|
├── slots/ ← Slot multiplexing (KEEP)
|
||||||
|
├── nomad/ ← Nomad client (KEEP)
|
||||||
|
├── tools/ ← Sandbox tool registry (PARTIALLY REDUNDANT)
|
||||||
|
└── sandbox_server.py ← HTTP server in containers (KEEP)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Plan: Consolidate into `environments/`
|
||||||
|
|
||||||
|
### What to KEEP from `atropos/`:
|
||||||
|
- `backends/` - Modal, Nomad, Singularity backends (valuable infrastructure for scale)
|
||||||
|
- `slots/` - Slot multiplexing
|
||||||
|
- `nomad/` - Nomad client
|
||||||
|
- `sandbox_server.py` - Container HTTP server
|
||||||
|
- `Dockerfile` - Sandbox container image
|
||||||
|
|
||||||
|
### What to REMOVE/REPLACE:
|
||||||
|
- `atropos/agent/atropos_agent.py` → replaced by `environments/agent_loop.py`
|
||||||
|
- `atropos/envs/agent_env.py` → functionality merged into `environments/hermes_base_env.py`
|
||||||
|
- `atropos/tools/` → replaced by `model_tools.py` + `tools/` (hermes-agent's standard tools)
|
||||||
|
|
||||||
|
### What to CREATE:
|
||||||
|
- `environments/gsm8k_agent_env.py` → GSM8k with tool calling, subclasses `HermesAgentBaseEnv`
|
||||||
|
- Update `environments/hermes_base_env.py` to optionally use sandbox backends (Nomad/Modal) for terminal isolation when needed for scale
|
||||||
|
|
||||||
|
### Steps:
|
||||||
|
1. Install atropos `tool_call_support` branch (PR #366)
|
||||||
|
2. Create `environments/gsm8k_agent_env.py` using `HermesAgentBaseEnv`
|
||||||
|
3. Port `swe_smith_oracle_env.py` to use `HermesAgentBaseEnv`
|
||||||
|
4. Make sandbox backends accessible from `HermesAgentBaseEnv` (terminal_backend config)
|
||||||
|
5. Remove redundant `atropos/agent/` and `atropos/envs/agent_env.py`
|
||||||
|
6. Clean up `atropos/tools/` (keep only sandbox-specific tools)
|
||||||
|
7. Update tinker-atropos gsm8k env to use proper base class
|
||||||
|
8. Test everything end-to-end
|
||||||
|
|
||||||
|
## Previous Completed Work
|
||||||
|
- Modal backend integration (Feb 8) - KEEP backends, update integration point
|
||||||
|
- Main branch merge (Feb 9) - completed
|
||||||
|
- Singularity/Apptainer (Feb 6) - KEEP
|
||||||
|
- Memory Bank initialized (Feb 5)
|
||||||
|
|
|
||||||
|
|
@ -1,96 +1,85 @@
|
||||||
# Progress
|
# Progress
|
||||||
|
|
||||||
|
## Current Sprint: Consolidate Environment Systems (Feb 10, 2026)
|
||||||
|
|
||||||
|
PR feedback from lead dev identified three fundamental issues with our approach:
|
||||||
|
1. Tool calling uses ICL (in-context learning) instead of proper `tools=` parameter
|
||||||
|
2. ManagedServer doesn't pass tools to `apply_chat_template()`
|
||||||
|
3. Only Hermes parser, no multi-model support
|
||||||
|
|
||||||
|
Teknium already built the correct approach in `environments/` directory. Our task is to consolidate.
|
||||||
|
|
||||||
|
### Status
|
||||||
|
- [ ] Install atropos `tool_call_support` branch (PR #366)
|
||||||
|
- [x] Create `environments/gsm8k_agent_env.py` using `HermesAgentBaseEnv`
|
||||||
|
- [ ] Port SWE env to `HermesAgentBaseEnv`
|
||||||
|
- [ ] Make sandbox backends accessible from `HermesAgentBaseEnv`
|
||||||
|
- [ ] Remove redundant `atropos/agent/` and `atropos/envs/agent_env.py`
|
||||||
|
- [ ] Clean up redundant `atropos/tools/`
|
||||||
|
- [ ] Test end-to-end with Tinker
|
||||||
|
|
||||||
## Completed Features
|
## Completed Features
|
||||||
|
|
||||||
### ✅ Modal Backend Integration (Feb 8, 2026 - MERGED & TESTED)
|
### ✅ Modal Backend Integration (Feb 8, 2026)
|
||||||
Merged the `modal-integration` branch and fixed integration issues.
|
- `ModalToolBackend` with slot-based multiplexing
|
||||||
|
- Multi-profile support (CPU, GPU, high-memory)
|
||||||
|
- Auto-scaling sandbox pool via Modal Sandboxes
|
||||||
|
- **Status: KEEP backends, but change integration point from atropos/envs/ to environments/**
|
||||||
|
|
||||||
**What Works:**
|
### ✅ Main Branch Merge (Feb 9, 2026)
|
||||||
- `ModalToolBackend` implements full `ToolBackend` interface (start, stop, acquire, release, execute_batch)
|
- Merged 22,560 lines, 79 files, 5 conflicts resolved
|
||||||
- Modal Sandboxes used for long-lived containers (not Functions)
|
- New: hermes_cli/, file_operations, RL training tools, gateway, cron
|
||||||
- `sandbox.exec()` for direct command execution (no HTTP server needed)
|
|
||||||
- Slot-based multiplexing matching Nomad pattern
|
|
||||||
- Multi-profile support (`ModalSandboxConfig`, `_ModalMultiProfileManager`)
|
|
||||||
- YAML profile loading (`modal_profiles.yaml`)
|
|
||||||
- `AgentEnvConfig` fields for all Modal settings (`--env.modal_*`)
|
|
||||||
- `create_tool_backend()` supports `tool_pool_mode="modal"`
|
|
||||||
- Terminal tool (`tools/terminal_tool.py`) native Modal integration with pool management
|
|
||||||
- Named sandbox recovery via `Sandbox.from_name()`
|
|
||||||
- Auto-scaling sandbox pool per profile
|
|
||||||
- Artifact helpers (read, list, archive)
|
|
||||||
|
|
||||||
**CLI Usage:**
|
### ✅ Tinker RL Training Setup (Feb 9, 2026)
|
||||||
```bash
|
- tinker 0.12.0 + tinker-atropos installed
|
||||||
# Atropos backend
|
- GSM8k agent env created (needs rewrite to use proper base class)
|
||||||
python -m atropos.envs.swe_smith_oracle_env process \
|
- Config for Qwen3-4B created
|
||||||
--env.tool_pool_mode modal \
|
- Pipeline verified: Tinker API connection works, all imports pass
|
||||||
--env.modal_image python:3.11
|
- **Blocked on billing** (Tinker 402 error - regional payment issue)
|
||||||
|
|
||||||
# Terminal tool
|
### ✅ Singularity/Apptainer Sandbox (Feb 6, 2026)
|
||||||
TERMINAL_ENV=modal ./hermes
|
- Nomad raw_exec driver for HPC clusters
|
||||||
```
|
- All sandbox operations tested and working
|
||||||
|
|
||||||
**Files Modified/Created:**
|
### ✅ Memory Bank (Feb 5, 2026)
|
||||||
- `atropos/backends/modal_backend.py` - Full implementation (~1200 lines)
|
- Project documentation structure initialized
|
||||||
- `atropos/backends/__init__.py` - `create_tool_backend()` updated
|
|
||||||
- `atropos/envs/agent_env.py` - 15 Modal config fields added
|
|
||||||
- `tools/terminal_tool.py` - Native Modal sandbox pool
|
|
||||||
- `docs/MODAL_BACKEND.md` - Documentation
|
|
||||||
- `modal_profiles.yaml.example` - Example profiles
|
|
||||||
- `tests/test_modal_integration.py` - Integration tests
|
|
||||||
- `tests/test_modal_stress.py` - Stress tests
|
|
||||||
- `tests/test_modal_terminal.py` - Terminal tool tests
|
|
||||||
|
|
||||||
### ✅ Singularity/Apptainer Sandbox Integration (Feb 6, 2026 - FULLY TESTED)
|
## What to KEEP vs REMOVE
|
||||||
Adapted the Atropos sandbox environment from Docker to Singularity/Apptainer for HPC clusters.
|
|
||||||
|
|
||||||
**What Works:**
|
### KEEP (valuable infrastructure):
|
||||||
- `create_sandbox_job()` supports both `driver="docker"` and `driver="singularity"`
|
| Component | Location | Purpose |
|
||||||
- SlotPoolConfig and NomadBackendConfig propagate driver settings
|
|-----------|----------|---------|
|
||||||
- Singularity container runs sandbox_server.py via Nomad's raw_exec driver
|
| Modal backend | `atropos/backends/modal_backend.py` | Cloud sandbox pool |
|
||||||
- All sandbox operations work: bash execution, file read/write
|
| Nomad backend | `atropos/backends/nomad_backend.py` | Docker/Singularity sandboxes |
|
||||||
- **CLI arguments** `--env.driver` and `--env.singularity_image` for AgentEnvConfig
|
| Slot pool | `atropos/slots/` | Container multiplexing |
|
||||||
- **Static port binding** for Singularity (ReservedPorts vs DynamicPorts)
|
| Nomad client | `atropos/nomad/` | Nomad API |
|
||||||
|
| Sandbox server | `atropos/sandbox_server.py` | HTTP server in containers |
|
||||||
|
| Dockerfile | `atropos/Dockerfile` | Container image |
|
||||||
|
| Agent loop | `environments/agent_loop.py` | Proper OpenAI-spec tool calling |
|
||||||
|
| Base env | `environments/hermes_base_env.py` | Phase 1/2 with parsers |
|
||||||
|
| Tool parsers | `environments/tool_call_parsers/` | 11+ model parsers |
|
||||||
|
|
||||||
### ✅ Memory Bank Initialized (Feb 5, 2026)
|
### REMOVE (redundant with environments/):
|
||||||
Set up project documentation structure for context persistence.
|
| Component | Location | Replaced By |
|
||||||
|
|-----------|----------|-------------|
|
||||||
## In Progress
|
| ICL agent | `atropos/agent/atropos_agent.py` | `environments/agent_loop.py` |
|
||||||
None currently.
|
| AgentEnv | `atropos/envs/agent_env.py` | `environments/hermes_base_env.py` |
|
||||||
|
| Tool registry | `atropos/tools/` | `model_tools.py` + `tools/` |
|
||||||
|
| GSM8k ICL env | `tinker-atropos/.../gsm8k_agent.py` | New proper version |
|
||||||
|
|
||||||
## Known Issues
|
## Known Issues
|
||||||
- Modal backend not yet live-tested with actual Modal cloud credentials
|
- Tinker billing (402 error) - user's payment didn't process
|
||||||
- `bwrap_available: false` in Singularity containers
|
- `bwrap_available: false` in Singularity containers
|
||||||
- Health check timing - may need longer wait for container startup on slower systems
|
- atropos `tool_call_support` branch not yet installed (PR #366)
|
||||||
|
|
||||||
## What's Left to Build
|
|
||||||
|
|
||||||
### Modal Backend
|
|
||||||
- [ ] Live test with Modal credentials on actual cloud
|
|
||||||
- [ ] Test multi-profile GPU workflows
|
|
||||||
- [ ] Test sandbox recovery after restart
|
|
||||||
- [ ] Integrate with SWE-smith-oracle env for GRPO training loop
|
|
||||||
- [ ] Performance benchmarking vs Nomad backend
|
|
||||||
|
|
||||||
### HPC Deployment
|
|
||||||
- [ ] Test on actual HPC cluster with Slurm/PBS integration
|
|
||||||
- [ ] Document cluster-specific deployment procedures
|
|
||||||
|
|
||||||
### Documentation
|
|
||||||
- [ ] Add Singularity deployment to README
|
|
||||||
- [ ] Create HPC deployment skill in skills/mlops/
|
|
||||||
|
|
||||||
## Evolution of Decisions
|
## Evolution of Decisions
|
||||||
|
|
||||||
### Container Runtime Selection
|
### Agent Architecture
|
||||||
- **Initial**: Docker-only via Nomad docker driver
|
- **v1 (our branch)**: ICL-based agent with `<tool_call>` XML tags in system prompt
|
||||||
- **Problem**: HPC clusters don't allow Docker without sudo
|
- **v2 (Teknium's)**: Proper OpenAI-spec tool calling with `tools=` parameter
|
||||||
- **Solution**: Added Singularity/Apptainer support via raw_exec driver
|
- **Decision**: Adopt v2, consolidate into `environments/`, keep sandbox backends from v1
|
||||||
- **Result**: Both runtimes now supported with same API
|
|
||||||
|
|
||||||
### Modal Backend Architecture
|
### Environment Organization
|
||||||
- **Initial**: Stub placeholder raising RuntimeError
|
- **Before**: Two parallel systems (`atropos/envs/` and `environments/`)
|
||||||
- **Investigation**: Modal Sandboxes vs Functions - chose Sandboxes for long-lived containers
|
- **After**: Single system in `environments/`, using `HermesAgentBaseEnv` as base class
|
||||||
- **Design**: Direct `sandbox.exec()` instead of HTTP/sandbox_server.py (simpler, no networking needed)
|
- Sandbox backends remain in `atropos/backends/` but integrate via terminal backend config
|
||||||
- **Implementation**: Merged from `modal-integration` branch, fixed agent_env.py config fields
|
|
||||||
- **Result**: Three backends now supported: Nomad/Docker, Nomad/Singularity, Modal
|
|
||||||
|
|
|
||||||
|
|
@ -148,11 +148,50 @@ The agent validates responses before accepting:
|
||||||
4. `AIAgent` reads env vars when initializing terminal tool
|
4. `AIAgent` reads env vars when initializing terminal tool
|
||||||
5. Terminal tool creates appropriate backend based on `TERMINAL_ENV`
|
5. Terminal tool creates appropriate backend based on `TERMINAL_ENV`
|
||||||
|
|
||||||
## Atropos Backend Architecture
|
## RL Training Architecture (Consolidated)
|
||||||
|
|
||||||
|
### Environment System (`environments/`)
|
||||||
|
|
||||||
|
The canonical way to build agentic RL environments in Hermes-Agent:
|
||||||
|
|
||||||
### Backend Hierarchy
|
|
||||||
```
|
```
|
||||||
ToolBackend (Protocol - base.py)
|
environments/
|
||||||
|
├── agent_loop.py ← HermesAgentLoop: OpenAI-spec tool calling
|
||||||
|
├── hermes_base_env.py ← HermesAgentBaseEnv: base class for all envs
|
||||||
|
├── tool_context.py ← ToolContext: reward function tool access
|
||||||
|
├── tool_call_parsers/ ← 11+ model parsers (hermes, qwen, deepseek, etc.)
|
||||||
|
├── terminal_test_env.py ← Example: file creation tasks
|
||||||
|
├── hermes_swe_env.py ← SWE environment
|
||||||
|
└── gsm8k_agent_env.py ← GSM8k with Python REPL
|
||||||
|
```
|
||||||
|
|
||||||
|
### Two-Phase Operation
|
||||||
|
- **Phase 1 (OpenAI server)**: Native tool_calls from vLLM/SGLang/OpenRouter
|
||||||
|
- Good for: SFT data gen, testing, evaluation
|
||||||
|
- **Phase 2 (ManagedServer)**: Client-side tool call parser + logprob tracking
|
||||||
|
- Required for: RL training
|
||||||
|
- Parser registry selects per-model parser (hermes, qwen, llama, etc.)
|
||||||
|
|
||||||
|
### Key Design: Proper Tool Calling (NOT ICL)
|
||||||
|
```python
|
||||||
|
# CORRECT: pass tools= to chat_completion()
|
||||||
|
response = await server.chat_completion(
|
||||||
|
messages=messages,
|
||||||
|
tools=tool_schemas, # ← tokenizer.apply_chat_template(tools=...) formats these
|
||||||
|
temperature=1.0,
|
||||||
|
)
|
||||||
|
# Response has response.choices[0].message.tool_calls (structured objects)
|
||||||
|
|
||||||
|
# WRONG (old approach): embed tools in system prompt as XML
|
||||||
|
system_prompt = f"<tools>{json.dumps(tools)}</tools>" # ← ICL, not proper training format
|
||||||
|
```
|
||||||
|
|
||||||
|
### Sandbox Backends (`atropos/backends/`)
|
||||||
|
|
||||||
|
Infrastructure for scaled sandbox execution (separate from the env system):
|
||||||
|
|
||||||
|
```
|
||||||
|
ToolBackend (Protocol)
|
||||||
├── NomadToolBackend → SlotPool → NomadClient + SandboxExecutor (HTTP)
|
├── NomadToolBackend → SlotPool → NomadClient + SandboxExecutor (HTTP)
|
||||||
│ ├── Docker driver (default)
|
│ ├── Docker driver (default)
|
||||||
│ └── Singularity driver (HPC)
|
│ └── Singularity driver (HPC)
|
||||||
|
|
@ -160,32 +199,16 @@ ToolBackend (Protocol - base.py)
|
||||||
└── _ModalMultiProfileManager (multi-profile support)
|
└── _ModalMultiProfileManager (multi-profile support)
|
||||||
```
|
```
|
||||||
|
|
||||||
### Slot-Based Multiplexing Pattern
|
Accessed via `HermesAgentBaseEnv.terminal_backend` config option:
|
||||||
All backends share the same slot multiplexing concept:
|
- `local` - Direct execution (default, development)
|
||||||
- **Sandbox/Container**: Long-lived compute unit
|
- `docker` - Docker containers
|
||||||
- **Slot**: Isolated workspace directory within a sandbox (e.g., `/data/slot_0`)
|
- `modal` - Modal cloud sandboxes (production RL)
|
||||||
- **Trajectory**: One agent task using one slot
|
- `singularity` - HPC clusters
|
||||||
- Multiple trajectories share a sandbox via different slots
|
- `ssh` - Remote server
|
||||||
|
|
||||||
### Nomad Backend (HTTP-based)
|
### Training Pipeline (Tinker + Atropos)
|
||||||
- Deploys `sandbox_server.py` inside containers (Docker or Singularity)
|
```
|
||||||
- Uses `SandboxExecutor` for HTTP communication (POST /execute, POST /batch)
|
Terminal 1: run-api (port 8000) ← Atropos Rollout API
|
||||||
- Nomad manages container lifecycle (scaling, health checks)
|
Terminal 2: launch_training.py (port 8001) ← Tinker Trainer + inference
|
||||||
- Tools: bash, bash_stateful, read_file, write_file, tmux
|
Terminal 3: environment.py serve ← Environment (rollouts)
|
||||||
|
|
||||||
### Modal Backend (exec-based)
|
|
||||||
- Creates `modal.Sandbox` instances (long-lived containers)
|
|
||||||
- Uses `sandbox.exec("bash", "-c", command)` directly (no HTTP server)
|
|
||||||
- Modal manages container lifecycle (idle_timeout, max_lifetime)
|
|
||||||
- Multi-profile support: different resource configs (CPU, GPU, memory)
|
|
||||||
- Named sandboxes for recovery: `Sandbox.from_name(app_name, sandbox_name)`
|
|
||||||
- YAML config via `modal_profiles.yaml`
|
|
||||||
|
|
||||||
### Backend Selection
|
|
||||||
```python
|
|
||||||
# In agent_env.py / create_tool_backend()
|
|
||||||
if mode == "nomad":
|
|
||||||
return NomadToolBackend(NomadBackendConfig.from_agent_env_config(cfg))
|
|
||||||
if mode == "modal":
|
|
||||||
return ModalToolBackend(ModalSandboxConfig.from_agent_env_config(cfg))
|
|
||||||
```
|
```
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue