diff --git a/.env.example b/.env.example
index 4c13afec63..1f2cba1da5 100644
--- a/.env.example
+++ b/.env.example
@@ -22,12 +22,13 @@ HERMES_BACKEND=openai
# of OpenRouter.
#
# Local server convenience (base URL without /v1):
-# ATROPOS_SERVER_BASE_URL=http://localhost:11434
+# llama.cpp example (see `Hermes-Agent/scripts/launch_llama_cpp_glm47_flash.sh`):
+# ATROPOS_SERVER_BASE_URL=http://127.0.0.1:8080
# ATROPOS_SERVER_MODEL=glm-4.7-flash
# ATROPOS_SERVER_API_KEY=local
#
# Generic OpenAI-compatible (base URL should include /v1):
-# OPENAI_BASE_URL=http://localhost:11434/v1
+# OPENAI_BASE_URL=http://127.0.0.1:8080/v1
# OPENAI_API_KEY=local
# =============================================================================
diff --git a/atropos/agent/atropos_agent.py b/atropos/agent/atropos_agent.py
index 9ea6e3044e..2fb1de6d68 100644
--- a/atropos/agent/atropos_agent.py
+++ b/atropos/agent/atropos_agent.py
@@ -15,6 +15,7 @@ The agent uses Hermes-style XML tags for tool calls:
import asyncio
import os
+import json
from contextlib import asynccontextmanager
from dataclasses import dataclass, field
from typing import Any, AsyncGenerator, Awaitable, Callable, Dict, List, Optional, Union
@@ -27,33 +28,66 @@ from atroposlib.envs.server_handling.managed_server import ManagedServer
load_dotenv()
-# Default system prompt with tool calling instructions
-AGENT_SYSTEM_PROMPT = """You are a helpful AI assistant with access to tools. You can use tools to accomplish tasks.
+# Default system prompt with tool calling instructions.
+#
+# IMPORTANT: In training-mode environments we want "raw text in -> raw text out" and we
+# parse tool calls from completion text. Do not rely on server-specific `tool_calls` fields.
+AGENT_SYSTEM_PROMPT = """You are a function-calling AI model.
-## Available Tools
+You are provided with function signatures within <tools></tools> XML tags.
+You may call one or more functions to assist with the user query. If available tools are not relevant,
+respond in natural language.
+
+After calling & executing a function, you will be provided with function results within
+<tool_response></tool_response> XML tags.
+
+Here are the available tools:
-{tool_descriptions}
+<tools>{tools_json}</tools>
-## How to Use Tools
-To use a tool, output a tool call in the following format:
-{{"name": "tool_name", "arguments": {{"arg1": "value1", "arg2": "value2"}}}}
+## REQUIRED TOOL FORMAT
-You may reason about what to do before calling a tool:
-I need to check what files are in the current directory...
-{{"name": "bash", "arguments": {{"command": "ls -la"}}}}
+When you decide to call a tool, your assistant message MUST be:
+1) exactly one <think> ... </think> block, followed by
+2) one or more <tool_call> ... </tool_call> blocks,
+and NOTHING else in that message.
-After a tool is executed, you will receive the result:
-{{"success": true, "output": "..."}}
+For each tool call, output a JSON object with this schema:
+{"name": "function_name", "arguments": { ... }}
-Continue using tools as needed until you have completed the task.
-When you have finished, provide your final response without any tool calls.
+Each tool call MUST be enclosed within <tool_call></tool_call> XML tags.
+The JSON inside MUST be valid JSON with double quotes.
-## Important Guidelines
-- Think step by step about what you need to do
-- Use tools to gather information and perform actions
-- If a tool call fails, analyze the error and try a different approach
-- Provide clear, concise responses when the task is complete
+Do NOT output <tool_response> in an assistant message.
+
+After you receive tool results, you may either call more tools (same required format) or provide the final answer.
+When providing the final answer, do NOT include any <tool_call> blocks.
+
+## ICL (examples)
+
+User: Show the current directory.
+Assistant:
+<think>I should use the terminal tool to print the current directory.</think>
+<tool_call>{"name": "terminal", "arguments": {"command": "pwd"}}</tool_call>
+User: <tool_response>{"success": true, "output": "/tmp\\n"}</tool_response>
+Assistant: /tmp
+
+User: List files, then count them.
+Assistant:
+<think>I should list files and count lines.</think>
+<tool_call>{"name": "terminal", "arguments": {"command": "ls -1 | wc -l"}}</tool_call>
+User: <tool_response>{"success": true, "output": "3\\n"}</tool_response>
+Assistant: 3
+
+User: Run pwd, then print ok.
+Assistant:
+<think>I should run pwd, then run a command that prints ok.</think>
+<tool_call>{"name": "terminal", "arguments": {"command": "pwd"}}</tool_call>
+<tool_call>{"name": "terminal", "arguments": {"command": "echo ok"}}</tool_call>
+User: <tool_response>{"success": true, "output": "/tmp\\n"}</tool_response>
+User: <tool_response>{"success": true, "output": "ok\\n"}</tool_response>
+Assistant: ok
"""
@@ -62,8 +96,9 @@ class AgentConfig:
"""Configuration for the AtroposAgent."""
# Generation parameters
- temperature: float = 0.7
- max_tokens: int = 4096
+ temperature: Optional[float] = 0.7
+ # Default to "let the backend decide" (important for tool-tag completions that may be longer).
+ max_tokens: Optional[int] = None
# Agent behavior
max_steps: int = 50
@@ -222,13 +257,53 @@ class AtroposAgent:
"""Build the system prompt with tool descriptions."""
if self.config.system_prompt:
return self.config.system_prompt
-
- tool_descriptions = self.tools.get_prompt_description()
- if not tool_descriptions:
- tool_descriptions = "(No tools available)"
-
- return AGENT_SYSTEM_PROMPT.format(tool_descriptions=tool_descriptions)
-
+
+ tools_json = self.tools.get_prompt_tool_definitions_json()
+ # Avoid `str.format()` here because the prompt contains many literal `{}` braces
+ # in JSON examples; we only want to substitute the single `{tools_json}` token.
+ return AGENT_SYSTEM_PROMPT.replace("{tools_json}", tools_json)
+
+ def _debug_dump_request(self, *, step_num: int, chat_kwargs: Dict[str, Any]) -> None:
+ if os.getenv("ATROPOS_DEBUG_AGENT_REQUEST") != "1":
+ return
+ try:
+ # Avoid dumping megabytes by default; messages can be huge.
+ meta = {
+ "step": step_num,
+ "chat_kwargs_keys": sorted(list(chat_kwargs.keys())),
+ "n": chat_kwargs.get("n"),
+ "max_tokens": chat_kwargs.get("max_tokens"),
+ "temperature": chat_kwargs.get("temperature"),
+ "num_messages": len(chat_kwargs.get("messages") or []),
+ }
+ print("\n=== ATROPOS_DEBUG_AGENT_REQUEST ===", flush=True)
+ print(meta, flush=True)
+
+ if os.getenv("ATROPOS_DEBUG_AGENT_REQUEST_FULL") == "1":
+ payload = dict(chat_kwargs)
+ # Make the payload more legible and less huge.
+ try:
+ dumped = json.dumps(payload, ensure_ascii=False, indent=2)
+ except Exception:
+ dumped = repr(payload)
+ print("\n=== ATROPOS_DEBUG_AGENT_REQUEST_FULL ===", flush=True)
+ print(dumped[:200_000], flush=True)
+ except Exception:
+ return
+
+ def _debug_dump_response(self, *, step_num: int, response: Any) -> None:
+ if os.getenv("ATROPOS_DEBUG_AGENT_RESPONSE") != "1":
+ return
+ print("\n=== ATROPOS_DEBUG_AGENT_RESPONSE ===", flush=True)
+ print({"step": step_num, "type": type(response).__name__}, flush=True)
+ try:
+ dumped = response.model_dump() # openai pydantic model
+ except Exception:
+ dumped = getattr(response, "__dict__", {"repr": repr(response)})
+ # Keep the dump bounded; we only need enough to see the assistant message content.
+ text = str(dumped)
+ print(text[:200_000], flush=True)
+
async def run(
self,
task: str,
@@ -265,12 +340,15 @@ class AtroposAgent:
# Keep a copy of the prompt messages used for this completion.
# Useful for reconstructing tokens/masks when state tracking is unavailable.
prompt_messages = list(messages)
- response = await managed.chat_completion(
- messages=messages,
- n=1,
- max_tokens=self.config.max_tokens,
- temperature=self.config.temperature,
- )
+ chat_kwargs: Dict[str, Any] = {"messages": messages, "n": 1}
+ if self.config.max_tokens is not None:
+ chat_kwargs["max_tokens"] = self.config.max_tokens
+ if self.config.temperature is not None:
+ chat_kwargs["temperature"] = self.config.temperature
+
+ self._debug_dump_request(step_num=step_num + 1, chat_kwargs=chat_kwargs)
+ response = await managed.chat_completion(**chat_kwargs)
+ self._debug_dump_response(step_num=step_num + 1, response=response)
current_node = None
if hasattr(managed, "get_state"):
@@ -286,7 +364,9 @@ class AtroposAgent:
error=f"Generation error: {str(e)}",
)
- response_text = response.choices[0].message.content or ""
+ msg = response.choices[0].message
+ # Some OpenAI-compatible servers populate `message.reasoning` and leave `content=""`.
+ response_text = (msg.content or "") or (getattr(msg, "reasoning", None) or "")
tool_calls = ToolCall.parse_from_text(response_text)
step = AgentStep(
@@ -380,12 +460,15 @@ class AtroposAgent:
Tuple of (response_text, tool_results, sequence_data)
"""
async with self._managed() as managed:
- response = await managed.chat_completion(
- messages=messages,
- n=1,
- max_tokens=self.config.max_tokens,
- temperature=self.config.temperature,
- )
+ chat_kwargs: Dict[str, Any] = {"messages": messages, "n": 1}
+ if self.config.max_tokens is not None:
+ chat_kwargs["max_tokens"] = self.config.max_tokens
+ if self.config.temperature is not None:
+ chat_kwargs["temperature"] = self.config.temperature
+
+ self._debug_dump_request(step_num=1, chat_kwargs=chat_kwargs)
+ response = await managed.chat_completion(**chat_kwargs)
+ self._debug_dump_response(step_num=1, response=response)
current_node = None
if hasattr(managed, "get_state"):
@@ -393,7 +476,8 @@ class AtroposAgent:
nodes = state.get("nodes", [])
current_node = nodes[-1] if nodes else None
- response_text = response.choices[0].message.content or ""
+ msg = response.choices[0].message
+ response_text = (msg.content or "") or (getattr(msg, "reasoning", None) or "")
tool_results = []
if execute_tools:
diff --git a/atropos/envs/agent_env.py b/atropos/envs/agent_env.py
index 0e88a22237..789ea88b7c 100644
--- a/atropos/envs/agent_env.py
+++ b/atropos/envs/agent_env.py
@@ -18,7 +18,7 @@ from pydantic import Field
from atroposlib.envs.base import APIServerConfig, BaseEnv, BaseEnvConfig, Item, ScoredDataGroup, ScoredDataItem
-from ..agent import AgentConfig, AtroposAgent
+from ..agent import AgentConfig, AgentResult, AtroposAgent
from ..slots import SlotPool, SlotPoolConfig
from ..tools import ToolRegistry, build_tool_registry
from ..tools.tool_executor import ToolExecutor, ToolExecutorConfig
@@ -56,7 +56,10 @@ class AgentEnvConfig(BaseEnvConfig):
# basic agent defaults
agent_max_steps: int = Field(default=50, description="Max ReACT steps per trajectory")
agent_temperature: float = Field(default=0.7, description="Sampling temperature")
- agent_max_tokens: int = Field(default=4096, description="Max tokens per model response")
+ agent_max_tokens: Optional[int] = Field(
+ default=None,
+ description="Max tokens per model response (default: let backend decide)",
+ )
agent_tool_delay_s: float = Field(default=0.0, description="Delay between tool calls (seconds)")
# tool selection
@@ -143,6 +146,7 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]):
*,
trajectory_id: str,
exec_tool: Callable[["ToolCall"], Awaitable["ToolResult"]],
+ agent_result: Optional[AgentResult] = None,
) -> tuple[float, Dict[str, Any]]:
"""
Optional hook: run in-sandbox verification before scoring.
@@ -152,7 +156,7 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]):
Default: calls `score_trajectory()` and returns empty metadata.
"""
- _ = (trajectory_id, exec_tool) # default ignores in-workspace verification
+ _ = (trajectory_id, exec_tool, agent_result) # default ignores in-workspace verification
score = await self.score_trajectory(item, final_response)
return score, {}
@@ -299,6 +303,7 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]):
result.final_response,
trajectory_id=trajectory_id,
exec_tool=_exec,
+ agent_result=result,
)
messages = [{"role": "system", "content": agent._build_system_prompt()}] # noqa: SLF001
diff --git a/atropos/envs/hermes_compat_test_env.py b/atropos/envs/hermes_compat_test_env.py
index 270b507f83..93b0fe2dd9 100644
--- a/atropos/envs/hermes_compat_test_env.py
+++ b/atropos/envs/hermes_compat_test_env.py
@@ -1,70 +1,63 @@
"""
-Hermes-Agent (Atropos-compatible) smoke environment.
+Hermes-Agent + Atropos (Nomad sandbox) compatibility smoke environment.
-This is a minimal `BaseEnv` environment that uses Hermes-Agent's Atropos-backed
-runner (`AtroposAIAgent`) and can be exercised via `BaseEnv`'s `process` mode.
+This environment is intended to validate, end-to-end:
+ BaseEnv.process -> AgentEnv -> ToolExecutor (batched) -> Nomad SlotPool -> sandbox_server
-This deliberately does NOT use slot multiplexing / sandboxes yet (stage 1).
+It forces the model to use a sandbox tool by asking it to run a command that
+generates a high-entropy token inside the sandbox, then repeat it exactly.
+
+Run (process mode):
+ uv run python -m atropos.envs.hermes_compat_test_env process --env.use_wandb false --env.total_steps 2 --env.group_size 1
"""
from __future__ import annotations
-import json
import os
-import uuid
-from typing import Dict, List, Tuple
+from typing import Any, Dict, List, Tuple
from dotenv import load_dotenv
from pydantic import Field
-from atroposlib.envs.base import APIServerConfig, BaseEnv, BaseEnvConfig, Item
+from atroposlib.envs.base import APIServerConfig, Item
+
+from ..agent import AgentConfig, AgentResult
+from ..tools import ToolCall
+from .agent_env import AgentEnv, AgentEnvConfig
load_dotenv()
-def _build_forced_tool_item() -> Item:
- """
- Construct a task that *cannot* be completed reliably without executing a tool.
-
- We generate a high-entropy token *inside the tool execution* and ask the agent to
- repeat it exactly. Scoring verifies that:
- - a terminal tool call occurred (role="tool" message present), and
- - the final answer matches the tool stdout exactly.
- """
+def _forced_tool_item() -> Item:
+ # Use double quotes in the shell command and show JSON escaping explicitly.
+ # This avoids invalid JSON escapes like `\\'` (not valid JSON) that some models produce.
+ cmd = 'python -c "import secrets; print(secrets.token_hex(16))"'
return {
- "command": "python -c \"import secrets; print(secrets.token_hex(16))\"",
+ "command": cmd,
"prompt": (
- "Use the terminal tool to run:\n"
- "python -c \"import secrets; print(secrets.token_hex(16))\"\n"
- "Then answer with EXACTLY what it printed and nothing else."
+ "You are acting as an agent inside a sandboxed environment.\n"
+ "You MUST use the terminal tool to execute commands.\n"
+ "Run this exact command:\n"
+ f"{cmd}\n"
+            "When you call the tool, use valid JSON inside <tool_call></tool_call>. Example:\n"
+            '<tool_call>{"name": "terminal", "arguments": {"command": '
+            '"python -c \\"import secrets; print(secrets.token_hex(16))\\""}}</tool_call>'
+ "\n"
+ "Then respond with EXACTLY what it printed (the hex token) and nothing else.\n"
+ "Do not guess. Do not explain."
),
}
-TEST_ITEMS: List[Item] = [
- _build_forced_tool_item(),
- _build_forced_tool_item(),
-]
-
-
-class HermesCompatTestEnvConfig(BaseEnvConfig):
- """Config for HermesCompatTestEnv."""
-
+class HermesCompatTestEnvConfig(AgentEnvConfig):
server_base_url: str = Field(
- default="http://localhost:11434",
+ default="http://127.0.0.1:8080",
description="Base URL for an OpenAI-compatible chat server (without /v1).",
)
server_model: str = Field(default="glm-4.7-flash", description="Model name")
-class HermesCompatTestEnv(BaseEnv):
- """
- Minimal BaseEnv that runs Hermes-Agent's Atropos-compatible agent loop.
-
- Run (process mode):
- uv run atropos-agent-hermes-compat-test process --env.use_wandb false --env.total_steps 2 --env.group_size 1
- """
-
+class HermesCompatTestEnv(AgentEnv[HermesCompatTestEnvConfig]):
name = "hermes_compat_test_env"
env_config_cls = HermesCompatTestEnvConfig
@@ -75,39 +68,22 @@ class HermesCompatTestEnv(BaseEnv):
slurm: bool = False,
testing: bool = False,
):
- super().__init__(config=config, server_configs=server_configs, slurm=slurm, testing=testing)
+ super().__init__(config, server_configs, slurm, testing)
self._iter = 0
- from atropos_compatible_agent import AtroposAIAgent # noqa: WPS433
-
- # Only expose terminal for this smoke env.
- self._agent = AtroposAIAgent(
- server=self.server,
- tokenizer=self.tokenizer,
- model=getattr(config, "server_model", "local"),
- max_iterations=8,
- enabled_toolsets=["terminal"],
- tool_delay=0.0,
- # Let the server decide token limits; we care about tool calling correctness here.
- max_tokens=None,
- temperature=None,
- )
-
@classmethod
def config_init(cls) -> Tuple[HermesCompatTestEnvConfig, List[APIServerConfig]]:
base_url = (
os.getenv("ATROPOS_SERVER_BASE_URL")
or os.getenv("OPENAI_BASE_URL")
or os.getenv("LLM_BASE_URL")
- or "http://localhost:11434"
+ or "http://127.0.0.1:8080"
)
model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash"
- # Never pass through real API keys in this smoke env (they will be printed by BaseEnv config logging).
- # Local OpenAI-compatible servers typically ignore the API key anyway.
- api_key = "local"
+ api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local"
env_config = HermesCompatTestEnvConfig(
- tokenizer_name="Qwen/Qwen2.5-1.5B-Instruct",
+ tokenizer_name="Qwen/Qwen2.5-1.5B-Instruct", # tokenization only
group_size=1,
use_wandb=False,
include_messages=True,
@@ -116,13 +92,18 @@ class HermesCompatTestEnv(BaseEnv):
batch_size=1,
server_base_url=base_url,
server_model=model,
+ # Tooling: sandbox-only terminal.
+ enabled_toolsets=["terminal"],
+ disabled_toolsets=[],
+ # Default to Nomad sandboxing; users can override via --env.* args.
+ sandbox_image=os.getenv("ATROPOS_SANDBOX_IMAGE") or "atropos-sandbox:local",
+ purge_job_on_shutdown=True,
)
server_configs = [
APIServerConfig(
- server_type="openai",
model_name=model,
- base_url=f"{base_url}/v1",
+ base_url=f"{base_url.rstrip('/')}/v1",
api_key=api_key,
num_max_requests_at_once=1,
num_requests_for_eval=1,
@@ -131,77 +112,55 @@ class HermesCompatTestEnv(BaseEnv):
]
return env_config, server_configs
- async def setup(self):
+ async def setup_agent_env(self) -> None:
return None
async def get_next_item(self) -> Item:
- # Regenerate token per task to avoid leakage across steps.
- item = _build_forced_tool_item()
self._iter += 1
- return item
+ return _forced_tool_item()
- async def collect_trajectory(self, item: Item):
- prompt = item.get("prompt", "")
+ def build_task(self, item: Item) -> str:
+ return str(item.get("prompt") or "")
- result = await self._agent.run_conversation_async(
- prompt,
- task_id=str(uuid.uuid4()),
+ def build_agent_config(self, item: Item) -> AgentConfig: # noqa: ARG002
+ # Avoid imposing max_tokens by default; tool-tag responses can be long for some models.
+ return AgentConfig(
+ max_steps=min(8, int(self.config.agent_max_steps)),
+ temperature=0.2,
+ max_tokens=None,
)
- final = (result.get("final_response") or "").strip()
+ async def score_trajectory(self, item: Item, final_response: str) -> float:
+ # Scoring happens in verify_and_score_trajectory so we can inspect tool results.
+ _ = (item, final_response)
+ return 0.0
+
+ async def verify_and_score_trajectory(
+ self,
+ item: Item,
+ final_response: str,
+ *,
+ trajectory_id: str, # noqa: ARG002
+ exec_tool, # noqa: ARG002
+ agent_result: AgentResult | None = None,
+ ) -> tuple[float, Dict[str, Any]]:
+ if agent_result is None:
+ return 0.0, {"error": "Missing agent_result"}
- # Verify the agent actually executed the tool by extracting stdout from the tool message.
observed: str = ""
- saw_tool = False
- for msg in result.get("messages", []):
- if msg.get("role") == "tool":
- saw_tool = True
- # Tool messages contain JSON strings from terminal tool.
- try:
- payload = json.loads(msg.get("content") or "{}")
- out = (payload.get("output") or "").strip()
- if out:
- observed = out.splitlines()[-1].strip()
- except Exception:
- continue
- # Pass if:
- # - a tool call occurred, and
- # - the final answer matches the observed stdout exactly.
- score = 1.0 if saw_tool and observed and final == observed else 0.0
+ tool_ok = False
+ for step in agent_result.steps:
+ for res in step.tool_results:
+ if not res.success:
+ return 0.0, {"error": res.error, "output": res.output}
+ out = (res.output or "").strip()
+ if out:
+ observed = out.splitlines()[-1].strip()
+ tool_ok = True
- # Tokenization fallback: build tokens/masks from final prompt + completion.
- # Note: this is sufficient for smoke testing; production training should
- # use a backend that supports ManagedServer state tracking.
- system_prompt = result.get("system_prompt")
- messages: List[Dict[str, str]] = result.get("messages", [])
- prompt_messages = messages[:-1] if messages and messages[-1].get("role") == "assistant" else messages
-
- if system_prompt:
- prompt_messages = [{"role": "system", "content": system_prompt}] + prompt_messages
-
- if hasattr(self.tokenizer, "apply_chat_template"):
- prompt_text = self.tokenizer.apply_chat_template(
- prompt_messages, tokenize=False, add_generation_prompt=True
- )
- prompt_tokens = self.tokenizer.encode(prompt_text, add_special_tokens=False)
- else:
- prompt_text = "\n".join([f"{m['role']}: {m['content']}" for m in prompt_messages])
- prompt_tokens = self.tokenizer.encode(prompt_text, add_special_tokens=True)
-
- output_tokens = self.tokenizer.encode(final, add_special_tokens=False)
-
- scored = {
- "tokens": prompt_tokens + output_tokens,
- "masks": ([-100] * len(prompt_tokens)) + output_tokens,
- "scores": score,
- "messages": prompt_messages + [{"role": "assistant", "content": final}],
- }
-
- return scored, []
-
- async def evaluate(self, *args, **kwargs): # noqa: ARG002
- # Minimal eval hook for BaseEnv abstract method.
- return {}
+ final = (final_response or "").strip()
+ score = 1.0 if tool_ok and agent_result.total_tool_calls > 0 and observed and final == observed else 0.0
+ return score, {"observed": observed, "tool_calls": agent_result.total_tool_calls, "command": item.get("command")}
if __name__ == "__main__":
diff --git a/atropos/envs/sandbox_terminal_smoke_env.py b/atropos/envs/sandbox_terminal_smoke_env.py
new file mode 100644
index 0000000000..9c140a30b6
--- /dev/null
+++ b/atropos/envs/sandbox_terminal_smoke_env.py
@@ -0,0 +1,169 @@
+"""
+Nomad sandbox terminal smoke environment (training-oriented).
+
+Validates, end-to-end:
+ BaseEnv.process -> AgentEnv -> ToolExecutor (batched) -> Nomad SlotPool -> sandbox_server
+
+It forces the model to use a sandbox tool by asking it to run a command that
+generates a high-entropy token inside the sandbox, then repeat it exactly.
+
+Run (process mode):
+ uv run python -m atropos.envs.sandbox_terminal_smoke_env process --env.use_wandb false --env.total_steps 2 --env.group_size 1
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict, List, Tuple
+
+from dotenv import load_dotenv
+from pydantic import Field
+
+from atroposlib.envs.base import APIServerConfig, Item
+
+from ..agent import AgentConfig, AgentResult
+from ..tools import ToolCall
+from .agent_env import AgentEnv, AgentEnvConfig
+
+load_dotenv()
+
+STRICT_TOOLCALL_SYSTEM_PROMPT = None
+
+
+def _forced_tool_item() -> Item:
+ # Use double quotes in the shell command and show JSON escaping explicitly.
+ # This avoids invalid JSON escapes like `\\'` (not valid JSON) that some models produce.
+ cmd = 'python -c "import secrets; print(secrets.token_hex(16))"'
+ return {
+ "command": cmd,
+ "prompt": (
+ "You MUST use the terminal tool.\n"
+ "Run this exact command:\n"
+ f"{cmd}\n"
+            "When you call the tool, use valid JSON inside <tool_call></tool_call>. Example:\n"
+            '<tool_call>{"name": "terminal", "arguments": {"command": '
+            '"python -c \\"import secrets; print(secrets.token_hex(16))\\""}}</tool_call>'
+ "\n"
+ "Then respond with EXACTLY what it printed (the hex token) and nothing else.\n"
+ "Do not guess. Do not explain."
+ ),
+ }
+
+
+class SandboxTerminalSmokeEnvConfig(AgentEnvConfig):
+ server_base_url: str = Field(
+ default="http://127.0.0.1:8080",
+ description="Base URL for an OpenAI-compatible chat server (without /v1).",
+ )
+ server_model: str = Field(default="glm-4.7-flash", description="Model name")
+
+
+class SandboxTerminalSmokeEnv(AgentEnv[SandboxTerminalSmokeEnvConfig]):
+ name = "sandbox_terminal_smoke_env"
+ env_config_cls = SandboxTerminalSmokeEnvConfig
+
+ def __init__(
+ self,
+ config: SandboxTerminalSmokeEnvConfig,
+ server_configs: List[APIServerConfig],
+ slurm: bool = False,
+ testing: bool = False,
+ ):
+ super().__init__(config, server_configs, slurm, testing)
+ self._iter = 0
+
+ @classmethod
+ def config_init(cls) -> Tuple[SandboxTerminalSmokeEnvConfig, List[APIServerConfig]]:
+ base_url = (
+ os.getenv("ATROPOS_SERVER_BASE_URL")
+ or os.getenv("OPENAI_BASE_URL")
+ or os.getenv("LLM_BASE_URL")
+ or "http://127.0.0.1:8080"
+ )
+ model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash"
+ api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local"
+
+ env_config = SandboxTerminalSmokeEnvConfig(
+ tokenizer_name="Qwen/Qwen2.5-1.5B-Instruct", # tokenization only
+ group_size=1,
+ use_wandb=False,
+ include_messages=True,
+ ensure_scores_are_not_same=False,
+ total_steps=2,
+ batch_size=1,
+ server_base_url=base_url,
+ server_model=model,
+ # Tooling: sandbox-only terminal.
+ enabled_toolsets=["terminal"],
+ disabled_toolsets=[],
+ # Default to Nomad sandboxing; users can override via --env.* args.
+ sandbox_image=os.getenv("ATROPOS_SANDBOX_IMAGE") or "atropos-sandbox:local",
+ purge_job_on_shutdown=True,
+ )
+
+ server_configs = [
+ APIServerConfig(
+ model_name=model,
+ base_url=f"{base_url.rstrip('/')}/v1",
+ api_key=api_key,
+ num_max_requests_at_once=1,
+ num_requests_for_eval=1,
+ timeout=120,
+ )
+ ]
+ return env_config, server_configs
+
+ async def setup_agent_env(self) -> None:
+ return None
+
+ async def get_next_item(self) -> Item:
+ self._iter += 1
+ return _forced_tool_item()
+
+ def build_task(self, item: Item) -> str:
+ return str(item.get("prompt") or "")
+
+ def build_agent_config(self, item: Item) -> AgentConfig: # noqa: ARG002
+ # Avoid imposing max_tokens by default; tool-tag responses can be long for some models.
+ return AgentConfig(
+ max_steps=min(8, int(self.config.agent_max_steps)),
+ temperature=0.2,
+ max_tokens=None,
+ system_prompt=STRICT_TOOLCALL_SYSTEM_PROMPT,
+ )
+
+ async def score_trajectory(self, item: Item, final_response: str) -> float:
+ # Scoring happens in verify_and_score_trajectory so we can inspect tool results.
+ _ = (item, final_response)
+ return 0.0
+
+ async def verify_and_score_trajectory(
+ self,
+ item: Item,
+ final_response: str,
+ *,
+ trajectory_id: str, # noqa: ARG002
+ exec_tool, # noqa: ARG002
+ agent_result: AgentResult | None = None,
+ ) -> tuple[float, Dict[str, Any]]:
+ if agent_result is None:
+ return 0.0, {"error": "Missing agent_result"}
+
+ observed: str = ""
+ tool_ok = False
+ for step in agent_result.steps:
+ for res in step.tool_results:
+ if not res.success:
+ return 0.0, {"error": res.error, "output": res.output}
+ out = (res.output or "").strip()
+ if out:
+ observed = out.splitlines()[-1].strip()
+ tool_ok = True
+
+ final = (final_response or "").strip()
+ score = 1.0 if tool_ok and agent_result.total_tool_calls > 0 and observed and final == observed else 0.0
+ return score, {"observed": observed, "tool_calls": agent_result.total_tool_calls, "command": item.get("command")}
+
+
+if __name__ == "__main__":
+ SandboxTerminalSmokeEnv.cli()
diff --git a/atropos/envs/swe_smith_oracle_env.py b/atropos/envs/swe_smith_oracle_env.py
index 756284ed2e..79f384495c 100644
--- a/atropos/envs/swe_smith_oracle_env.py
+++ b/atropos/envs/swe_smith_oracle_env.py
@@ -72,7 +72,7 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
os.getenv("ATROPOS_SERVER_BASE_URL")
or os.getenv("OPENAI_BASE_URL")
or os.getenv("LLM_BASE_URL")
- or "http://localhost:11434"
+ or "http://127.0.0.1:8080"
)
model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash"
api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local"
@@ -252,6 +252,7 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
*,
trajectory_id: str,
exec_tool,
+ agent_result=None, # noqa: ARG002
) -> tuple[float, Dict[str, Any]]:
_ = trajectory_id
repo_dir = self._repo_name(item)
diff --git a/atropos/envs/test_env.py b/atropos/envs/test_env.py
index 8ae6a2f27c..7c7f08e011 100644
--- a/atropos/envs/test_env.py
+++ b/atropos/envs/test_env.py
@@ -61,7 +61,7 @@ class SimpleTestEnvConfig(AgentEnvConfig):
"""Configuration for the simple test environment."""
server_base_url: str = Field(
- default="http://localhost:11434",
+ default="http://127.0.0.1:8080",
description="Base URL for an OpenAI-compatible server (without /v1)",
)
server_model: str = Field(
@@ -102,7 +102,7 @@ class SimpleTestEnv(AgentEnv[SimpleTestEnvConfig]):
os.getenv("ATROPOS_SERVER_BASE_URL")
or os.getenv("OPENAI_BASE_URL")
or os.getenv("LLM_BASE_URL")
- or "http://localhost:11434"
+ or "http://127.0.0.1:8080"
)
model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash"
api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local"
diff --git a/atropos/envs/toolserver_smoke_env.py b/atropos/envs/toolserver_smoke_env.py
new file mode 100644
index 0000000000..4b39af468d
--- /dev/null
+++ b/atropos/envs/toolserver_smoke_env.py
@@ -0,0 +1,162 @@
+"""
+ToolServer routing smoke environment.
+
+Validates that:
+ - sandbox tools run through Nomad SlotPool (terminal -> bash in sandbox)
+ - external tools run through ToolServer (skills_list)
+
+This env uses ToolServer in-process by default (`tool_server_url="inprocess"`),
+so it is self-contained for local testing.
+
+Run:
+ uv run python -m atropos.envs.toolserver_smoke_env process --env.use_wandb false --env.total_steps 1 --env.group_size 1
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict, List, Tuple
+
+from dotenv import load_dotenv
+from pydantic import Field
+
+from atroposlib.envs.base import APIServerConfig, Item
+
+from ..agent import AgentConfig, AgentResult
+from .agent_env import AgentEnv, AgentEnvConfig
+
+load_dotenv()
+
+
+class ToolServerSmokeEnvConfig(AgentEnvConfig):
+ server_base_url: str = Field(
+ default="http://127.0.0.1:8080",
+ description="Base URL for an OpenAI-compatible chat server (without /v1).",
+ )
+ server_model: str = Field(default="glm-4.7-flash", description="Model name")
+
+
+class ToolServerSmokeEnv(AgentEnv[ToolServerSmokeEnvConfig]):
+ name = "toolserver_smoke_env"
+ env_config_cls = ToolServerSmokeEnvConfig
+
+ def __init__(
+ self,
+ config: ToolServerSmokeEnvConfig,
+ server_configs: List[APIServerConfig],
+ slurm: bool = False,
+ testing: bool = False,
+ ):
+ super().__init__(config, server_configs, slurm, testing)
+ self._iter = 0
+
+ @classmethod
+ def config_init(cls) -> Tuple[ToolServerSmokeEnvConfig, List[APIServerConfig]]:
+ base_url = (
+ os.getenv("ATROPOS_SERVER_BASE_URL")
+ or os.getenv("OPENAI_BASE_URL")
+ or os.getenv("LLM_BASE_URL")
+ or "http://127.0.0.1:8080"
+ )
+ model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash"
+ api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local"
+
+ env_config = ToolServerSmokeEnvConfig(
+ tokenizer_name="Qwen/Qwen2.5-1.5B-Instruct", # tokenization only
+ group_size=1,
+ use_wandb=False,
+ include_messages=True,
+ ensure_scores_are_not_same=False,
+ total_steps=1,
+ batch_size=1,
+ server_base_url=base_url,
+ server_model=model,
+ enabled_toolsets=["terminal", "skills"],
+ disabled_toolsets=[],
+ # Self-contained ToolServer for local smoke.
+ tool_server_url="inprocess",
+ sandbox_image=os.getenv("ATROPOS_SANDBOX_IMAGE") or "atropos-sandbox:local",
+ purge_job_on_shutdown=True,
+ )
+
+ server_configs = [
+ APIServerConfig(
+ model_name=model,
+ base_url=f"{base_url.rstrip('/')}/v1",
+ api_key=api_key,
+ num_max_requests_at_once=1,
+ num_requests_for_eval=1,
+ timeout=120,
+ )
+ ]
+ return env_config, server_configs
+
+ async def setup_agent_env(self) -> None:
+ return None
+
+ async def get_next_item(self) -> Item:
+ self._iter += 1
+ return {
+ "prompt": (
+ "You MUST call exactly one tool per assistant message.\n"
+ "\n"
+ "Step 1) Call the skills_list tool (no arguments), then stop.\n"
+ "Step 2) After you receive the tool response, call the terminal tool to run:\n"
+ "python -c \"print('ok')\"\n"
+ "Step 3) After you receive the terminal tool response, answer with just: ok\n"
+ "\n"
+ "Tool call format requirements:\n"
+                "- Every tool call MUST be a complete <tool_call>...</tool_call> XML block with a closing tag.\n"
+                "- Do NOT emit a second <tool_call> in the same assistant message.\n"
+ "\n"
+ "Example:\n"
+                "<tool_call>{\"name\": \"skills_list\", \"arguments\": {}}</tool_call>\n"
+ "Do not include anything else in your final answer."
+ )
+ }
+
+ def build_task(self, item: Item) -> str:
+ return str(item.get("prompt") or "")
+
+ def build_agent_config(self, item: Item) -> AgentConfig: # noqa: ARG002
+ return AgentConfig(
+ max_steps=min(10, int(self.config.agent_max_steps)),
+ temperature=0.2,
+ max_tokens=None,
+ )
+
+ async def score_trajectory(self, item: Item, final_response: str) -> float:
+ _ = (item, final_response)
+ return 0.0
+
+ async def verify_and_score_trajectory(
+ self,
+ item: Item,
+ final_response: str,
+ *,
+ trajectory_id: str, # noqa: ARG002
+ exec_tool, # noqa: ARG002
+ agent_result: AgentResult | None = None,
+ ) -> tuple[float, Dict[str, Any]]:
+ if agent_result is None:
+ return 0.0, {"error": "Missing agent_result"}
+
+ called = {c.name for s in agent_result.steps for c in s.tool_calls}
+ need = {"skills_list", "terminal"}
+ if not need.issubset(called):
+ return 0.0, {"error": f"Missing tool calls: {sorted(need - called)}", "called": sorted(called)}
+
+ terminal_ok = False
+ for step in agent_result.steps:
+ for call, res in zip(step.tool_calls, step.tool_results):
+ if call.name != "terminal":
+ continue
+ if res.success and (res.output or "").strip().splitlines()[-1].strip() == "ok":
+ terminal_ok = True
+
+ score = 1.0 if terminal_ok and (final_response or "").strip() == "ok" else 0.0
+ return score, {"called": sorted(called), "final": (final_response or "").strip()}
+
+
+if __name__ == "__main__":
+ ToolServerSmokeEnv.cli()
diff --git a/atropos/slots/pool.py b/atropos/slots/pool.py
index ba7eb683b3..d6ace16b7d 100644
--- a/atropos/slots/pool.py
+++ b/atropos/slots/pool.py
@@ -138,42 +138,47 @@ class SlotPool:
return
logger.info(f"Starting SlotPool (job_id={self.config.job_id})")
-
- # Check Nomad health
- if not await self.nomad.is_healthy():
- raise RuntimeError(f"Nomad is not reachable at {self.config.nomad_address}")
-
- # Check if job exists
- job = await self.nomad.get_job(self.config.job_id)
-
- if job is None:
- # Deploy new job
- logger.info(f"Deploying sandbox job: {self.config.job_id}")
- job_spec = create_sandbox_job(
- job_id=self.config.job_id,
- image=self.config.image,
- count=self.config.min_containers,
- slots_per_container=self.config.slots_per_container,
- privileged=self.config.privileged,
- cpu=self.config.cpu,
- memory=self.config.memory,
- datacenter=self.config.datacenter,
- )
- result = await self.nomad.submit_job(job_spec)
- if "error" in result:
- raise RuntimeError(f"Failed to submit job: {result}")
-
- # Wait for allocations to be running
+
+ try:
+ # Check Nomad health
+ if not await self.nomad.is_healthy():
+ raise RuntimeError(f"Nomad is not reachable at {self.config.nomad_address}")
+
+ # Check if job exists
+ job = await self.nomad.get_job(self.config.job_id)
+
+ if job is None:
+ # Deploy new job
+ logger.info(f"Deploying sandbox job: {self.config.job_id}")
+ job_spec = create_sandbox_job(
+ job_id=self.config.job_id,
+ image=self.config.image,
+ count=self.config.min_containers,
+ slots_per_container=self.config.slots_per_container,
+ privileged=self.config.privileged,
+ cpu=self.config.cpu,
+ memory=self.config.memory,
+ datacenter=self.config.datacenter,
+ )
+ result = await self.nomad.submit_job(job_spec)
+ if "error" in result:
+ raise RuntimeError(f"Failed to submit job: {result}")
+
+ # Wait for allocations to be running (even if the job already existed).
await self._wait_for_healthy_allocations(self.config.min_containers)
-
- # Discover existing allocations and slots
- await self._refresh_slots()
-
- # Start health check task
- self._health_task = asyncio.create_task(self._health_check_loop())
-
- self._started = True
- logger.info(f"SlotPool started: {self.total_slots} slots available")
+
+ # Discover existing allocations and slots
+ await self._refresh_slots()
+
+ # Start health check task
+ self._health_task = asyncio.create_task(self._health_check_loop())
+
+ self._started = True
+ logger.info(f"SlotPool started: {self.total_slots} slots available")
+ except Exception:
+ # Ensure aiohttp sessions are not leaked if we fail to start.
+ await self.stop(purge_job=False)
+ raise
async def stop(self, purge_job: bool = False) -> None:
"""
@@ -384,6 +389,19 @@ class SlotPool:
"""Wait for allocations to become healthy."""
import time
start = time.time()
+
+ def _summarize_alloc_detail(detail: Dict[str, Any]) -> str:
+ task_states = detail.get("TaskStates") or {}
+ parts: List[str] = []
+ if isinstance(task_states, dict):
+ for task_name, st in task_states.items():
+ events = (st or {}).get("Events") or []
+ if isinstance(events, list) and events:
+ last = events[-1]
+ desc = last.get("DisplayMessage") or last.get("Message") or last.get("Type") or ""
+ if desc:
+ parts.append(f"{task_name}: {desc}")
+ return "; ".join(parts)
while time.time() - start < timeout:
allocs = await self.nomad.get_job_allocations(self.config.job_id)
@@ -393,13 +411,45 @@ class SlotPool:
if alloc.status == AllocationStatus.RUNNING and alloc.http_address:
if await self.executor.health_check(alloc.http_address):
healthy_count += 1
+
+ # Fast-fail on obvious driver/image errors to avoid waiting out the full timeout.
+ if alloc.id:
+ detail = await self.nomad.get_allocation(alloc.id)
+ if isinstance(detail, dict):
+ summary = _summarize_alloc_detail(detail)
+ lowered = summary.lower()
+ if "failed to pull" in lowered or "pull access denied" in lowered:
+ raise RuntimeError(
+ "Nomad allocation failed to start due to a Docker image pull error. "
+ f"Allocation {alloc.id[:8]}: {summary}\n"
+ "If you're using a local image tag (e.g. `atropos-sandbox:local`) on macOS, "
+ "make sure the image is loaded into Docker (build with `docker buildx build --load ...`)."
+ )
if healthy_count >= min_count:
return
await asyncio.sleep(2.0)
-
- raise RuntimeError(f"Timed out waiting for {min_count} healthy allocations")
+
+ # Timed out: include allocation status detail to help debugging.
+ allocs = await self.nomad.get_job_allocations(self.config.job_id)
+ alloc_lines: List[str] = []
+ for alloc in allocs[:10]:
+ addr = alloc.http_address or "-"
+ line = f"{alloc.id[:8]} status={alloc.status.value} http={addr}"
+ detail = await self.nomad.get_allocation(alloc.id)
+ if isinstance(detail, dict):
+ summary = _summarize_alloc_detail(detail)
+ if summary:
+ line += f" detail={summary}"
+ alloc_lines.append(line)
+
+ hint = (
+ "Timed out waiting for healthy sandbox allocations.\n"
+ f"Job: {self.config.job_id}, desired_healthy: {min_count}\n"
+ "Allocations:\n - " + "\n - ".join(alloc_lines)
+ )
+ raise RuntimeError(hint)
async def _try_scale_up(self) -> bool:
"""Attempt to scale up the job."""
diff --git a/atropos/tools/base.py b/atropos/tools/base.py
index 33c9d1017a..a27b8f1ad1 100644
--- a/atropos/tools/base.py
+++ b/atropos/tools/base.py
@@ -72,26 +72,65 @@ class ToolCall:
"""
Extract tool calls from text using Hermes-style XML tags.
- Format: {"name": "...", "arguments": {...}}
+ Supported formats (STRICT: requires well-formed closing tags):
+ - Hermes JSON wrapper:
+ {"name": "...", "arguments": {...}}
+ - GLM/llama.cpp style:
+ terminal{"command":"ls -la"}
"""
- calls = []
+ calls: List["ToolCall"] = []
+
+ if not text:
+ return calls
+
+ def _append_from_payload(*, name: str, arguments: Dict[str, Any], raw: str, uniq_id: Optional[str] = None) -> None:
+ if not isinstance(name, str) or not name:
+ return
+ if not isinstance(arguments, dict):
+ return
+ calls.append(
+ cls(
+ name=name,
+ arguments=arguments,
+ raw_text=raw,
+ uniq_id=uniq_id or str(uuid.uuid4()),
+ )
+ )
+
+ # STRICT parsing: only accept well-formed <tool_call>...</tool_call> blocks.
pattern = r"\s*(.*?)\s*"
- matches = re.findall(pattern, text, re.DOTALL)
-
- for match in matches:
- try:
- data = json.loads(match)
- uniq_id = data.get("uniq_id") or data.get("id") or str(uuid.uuid4())
- calls.append(cls(
+ for inner in re.findall(pattern, text, re.DOTALL):
+ cleaned = (inner or "").strip()
+ if not cleaned:
+ continue
+
+ # Hermes JSON wrapper.
+ if cleaned.startswith("{"):
+ try:
+ data = json.loads(cleaned)
+ except json.JSONDecodeError:
+ continue
+ uniq_id = data.get("uniq_id") or data.get("id") or None
+ _append_from_payload(
name=data.get("name", ""),
arguments=data.get("arguments", {}),
- raw_text=match,
+ raw=inner,
uniq_id=uniq_id,
- ))
- except json.JSONDecodeError:
- # Skip malformed tool calls
+ )
continue
-
+
+ # GLM/llama.cpp style: terminal{...}
+ m = re.match(r"^\s*([A-Za-z0-9_.:\\-]+)\s*(\{.*\})\s*$", cleaned, re.DOTALL)
+ if not m:
+ continue
+ name = m.group(1)
+ args_text = m.group(2)
+ try:
+ args = json.loads(args_text)
+ except json.JSONDecodeError:
+ continue
+ _append_from_payload(name=name, arguments=args, raw=inner)
+
return calls
@classmethod
@@ -208,6 +247,27 @@ class ToolRegistry:
"""Generate tool descriptions for system prompt."""
descriptions = [tool.schema.to_prompt_description() for tool in self._tools.values()]
return "\n\n".join(descriptions)
+
+ def get_prompt_tool_definitions_json(self) -> str:
+ """
+ Return a Hermes-style JSON list of tool definitions for use inside a `<tools>...</tools>` block.
+
+ Hermes trajectories historically use a simplified schema list:
+ [{"name": ..., "description": ..., "parameters": {...}, "required": null}, ...]
+ """
+ formatted: List[Dict[str, Any]] = []
+ for tool in self._tools.values():
+ fn = tool.schema.to_dict().get("function", {})
+ formatted.append(
+ {
+ "name": fn.get("name", tool.name),
+ "description": fn.get("description", ""),
+ "parameters": fn.get("parameters", {}),
+ # Keep parity with Hermes saved trajectories (required is typically null there).
+ "required": None,
+ }
+ )
+ return json.dumps(formatted, ensure_ascii=False)
async def execute(self, call: ToolCall) -> ToolResult:
"""Execute a tool call."""
diff --git a/hermes_agent.egg-info/PKG-INFO b/hermes_agent.egg-info/PKG-INFO
index 98fb7f6d68..d17e541623 100644
--- a/hermes_agent.egg-info/PKG-INFO
+++ b/hermes_agent.egg-info/PKG-INFO
@@ -29,7 +29,7 @@ Provides-Extra: dev
Requires-Dist: pytest; extra == "dev"
Requires-Dist: pytest-asyncio; extra == "dev"
Provides-Extra: atropos
-Requires-Dist: atroposlib @ git+ssh://git@github.com/NousResearch/atropos.git ; extra == "atropos"
+Requires-Dist: atroposlib @ git+https://github.com/NousResearch/atropos.git ; extra == "atropos"
Requires-Dist: aiohttp; extra == "atropos"
Requires-Dist: fastapi; extra == "atropos"
Requires-Dist: uvicorn; extra == "atropos"
diff --git a/hermes_agent.egg-info/SOURCES.txt b/hermes_agent.egg-info/SOURCES.txt
index 3464034f41..87e0cd7d93 100644
--- a/hermes_agent.egg-info/SOURCES.txt
+++ b/hermes_agent.egg-info/SOURCES.txt
@@ -17,9 +17,10 @@ atropos/api/tool_executor_server.py
atropos/api/tool_server.py
atropos/envs/__init__.py
atropos/envs/agent_env.py
-atropos/envs/hermes_compat_test_env.py
+atropos/envs/sandbox_terminal_smoke_env.py
atropos/envs/swe_smith_oracle_env.py
atropos/envs/test_env.py
+atropos/envs/toolserver_smoke_env.py
atropos/nomad/__init__.py
atropos/nomad/client.py
atropos/slots/__init__.py
@@ -30,18 +31,13 @@ atropos/terminal/__init__.py
atropos/terminal/asciinema_stream.py
atropos/tools/__init__.py
atropos/tools/base.py
-atropos/tools/basic_tools.py
-atropos/tools/image_generation_tool.py
-atropos/tools/mixture_of_agents_tool.py
-atropos/tools/terminal_hecate.py
+atropos/tools/build_registry.py
+atropos/tools/hermes_external_tools.py
+atropos/tools/sandbox_stubs.py
atropos/tools/terminal_stateful_tool.py
-atropos/tools/terminal_tool.py
atropos/tools/tmux_tool.py
atropos/tools/tool_executor.py
-atropos/tools/toolset_distributions.py
-atropos/tools/toolsets.py
-atropos/tools/vision_tools.py
-atropos/tools/web_tools.py
+atropos/tools/toolset_resolver.py
hermes_agent.egg-info/PKG-INFO
hermes_agent.egg-info/SOURCES.txt
hermes_agent.egg-info/dependency_links.txt
diff --git a/hermes_agent.egg-info/entry_points.txt b/hermes_agent.egg-info/entry_points.txt
index 2e72c8e210..42fd7548f8 100644
--- a/hermes_agent.egg-info/entry_points.txt
+++ b/hermes_agent.egg-info/entry_points.txt
@@ -1,2 +1,4 @@
[console_scripts]
hermes-agent = run_agent:main
+hermes-atropos-sandbox-smoke = atropos.envs.sandbox_terminal_smoke_env:SandboxTerminalSmokeEnv.cli
+hermes-atropos-toolserver-smoke = atropos.envs.toolserver_smoke_env:ToolServerSmokeEnv.cli
diff --git a/hermes_agent.egg-info/requires.txt b/hermes_agent.egg-info/requires.txt
index 0ef7437426..6e9ad30e5a 100644
--- a/hermes_agent.egg-info/requires.txt
+++ b/hermes_agent.egg-info/requires.txt
@@ -16,7 +16,7 @@ typer
platformdirs
[atropos]
-atroposlib @ git+ssh://git@github.com/NousResearch/atropos.git
+atroposlib @ git+https://github.com/NousResearch/atropos.git
aiohttp
fastapi
uvicorn
diff --git a/pyproject.toml b/pyproject.toml
index aa78046db3..328a16a0fd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ modal = ["modal", "boto3"]
dev = ["pytest", "pytest-asyncio"]
# Install Atropos from source (PyPI is often stale for this internal dependency).
atropos = [
- "atroposlib @ git+ssh://git@github.com/NousResearch/atropos.git",
+ "atroposlib @ git+https://github.com/NousResearch/atropos.git",
# Atropos integration runtime deps (kept optional for Hermes-only users)
"aiohttp",
"fastapi",
@@ -47,6 +47,8 @@ atropos = [
[project.scripts]
hermes-agent = "run_agent:main"
+hermes-atropos-sandbox-smoke = "atropos.envs.sandbox_terminal_smoke_env:SandboxTerminalSmokeEnv.cli"
+hermes-atropos-toolserver-smoke = "atropos.envs.toolserver_smoke_env:ToolServerSmokeEnv.cli"
[tool.setuptools]
py-modules = [
diff --git a/scripts/launch_llama_cpp_glm47_flash.sh b/scripts/launch_llama_cpp_glm47_flash.sh
new file mode 100755
index 0000000000..3f3716a2fa
--- /dev/null
+++ b/scripts/launch_llama_cpp_glm47_flash.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Launch a local llama.cpp OpenAI-compatible server running GLM-4.7-Flash (GGUF).
+#
+# Requires:
+# - `llama-server` installed (e.g. `brew install llama.cpp`)
+#
+# Default settings are chosen to avoid clashing with Atropos sandbox_server
+# (which commonly uses port 8080 in local dev).
+#
+# Usage:
+# Hermes-Agent/scripts/launch_llama_cpp_glm47_flash.sh
+#
+# Override defaults:
+# LLAMA_CPP_HOST=127.0.0.1 LLAMA_CPP_PORT=8082 \
+# LLAMA_CPP_HF_REPO=ggml-org/GLM-4.7-Flash-GGUF \
+# LLAMA_CPP_HF_FILE=GLM-4.7-Flash-Q4_K.gguf \
+# Hermes-Agent/scripts/launch_llama_cpp_glm47_flash.sh
+
+HOST="${LLAMA_CPP_HOST:-127.0.0.1}"
+PORT="${LLAMA_CPP_PORT:-8080}"
+HF_REPO="${LLAMA_CPP_HF_REPO:-ggml-org/GLM-4.7-Flash-GGUF}"
+HF_FILE="${LLAMA_CPP_HF_FILE:-GLM-4.7-Flash-Q4_K.gguf}"
+ALIAS="${LLAMA_CPP_ALIAS:-glm-4.7-flash}"
+
+if ! command -v llama-server >/dev/null 2>&1; then
+ echo "Error: llama-server not found in PATH."
+ echo "Install via Homebrew: brew install llama.cpp"
+ exit 1
+fi
+
+echo "Launching llama.cpp server..."
+echo " host: $HOST"
+echo " port: $PORT"
+echo " repo: $HF_REPO"
+echo " file: $HF_FILE"
+echo " alias: $ALIAS"
+echo
+echo "Suggested env vars for Hermes/Atropos integration:"
+echo " export ATROPOS_SERVER_BASE_URL=http://${HOST}:${PORT}"
+echo " export ATROPOS_SERVER_MODEL=${ALIAS}"
+echo " export ATROPOS_SERVER_API_KEY=local"
+echo
+
+if command -v lsof >/dev/null 2>&1; then
+ if lsof -nP -iTCP:"$PORT" -sTCP:LISTEN >/dev/null 2>&1; then
+ echo "Error: port $PORT is already in use."
+ echo "Pick a different port, e.g.:"
+ echo " LLAMA_CPP_PORT=8082 Hermes-Agent/scripts/launch_llama_cpp_glm47_flash.sh"
+ exit 1
+ fi
+fi
+
+exec llama-server \
+ --host "$HOST" \
+ --port "$PORT" \
+ --hf-repo "$HF_REPO" \
+ --hf-file "$HF_FILE" \
+ --alias "$ALIAS" \
+ -c 32768 \
+ -n -1
diff --git a/tests/test_tool_call_parsing.py b/tests/test_tool_call_parsing.py
new file mode 100644
index 0000000000..49db864b24
--- /dev/null
+++ b/tests/test_tool_call_parsing.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+from atropos.tools.base import ToolCall
+
+
+def test_parse_tool_call_json_wrapper() -> None:
+ text = '{"name":"terminal","arguments":{"command":"pwd"}}'
+ calls = ToolCall.parse_from_text(text)
+ assert len(calls) == 1
+ assert calls[0].name == "terminal"
+ assert calls[0].arguments == {"command": "pwd"}
+
+
+def test_parse_tool_call_glm_style() -> None:
+ text = 'terminal{"command":"ls -la"}'
+ calls = ToolCall.parse_from_text(text)
+ assert len(calls) == 1
+ assert calls[0].name == "terminal"
+ assert calls[0].arguments == {"command": "ls -la"}
+
+
+def test_parse_tool_call_missing_close_tag() -> None:
+ text = 'terminal{"command":"echo hi"}'
+ calls = ToolCall.parse_from_text(text)
+ assert calls == []
+
+
+def test_parse_tool_call_strips_accidental_xml() -> None:
+ text = 'terminal{"command":"ls -la"}'
+ calls = ToolCall.parse_from_text(text)
+ assert calls == []
diff --git a/uv.lock b/uv.lock
index 52eaaccbe0..0cb6f730fc 100644
--- a/uv.lock
+++ b/uv.lock
@@ -218,7 +218,7 @@ wheels = [
[[package]]
name = "atroposlib"
version = "0.3.0"
-source = { git = "ssh://git@github.com/NousResearch/atropos.git#462abbebf75f44e811116c3730ce9874c4358a80" }
+source = { git = "https://github.com/NousResearch/atropos.git#462abbebf75f44e811116c3730ce9874c4358a80" }
dependencies = [
{ name = "aiofiles" },
{ name = "aiohttp" },
@@ -913,7 +913,7 @@ modal = [
[package.metadata]
requires-dist = [
{ name = "aiohttp", marker = "extra == 'atropos'" },
- { name = "atroposlib", marker = "extra == 'atropos'", git = "ssh://git@github.com/NousResearch/atropos.git" },
+ { name = "atroposlib", marker = "extra == 'atropos'", git = "https://github.com/NousResearch/atropos.git" },
{ name = "boto3", marker = "extra == 'modal'" },
{ name = "fal-client" },
{ name = "fastapi", marker = "extra == 'atropos'" },