From 16fb41f9cc77b70762aaeef42b0e1efd16b2ad6f Mon Sep 17 00:00:00 2001
From: Shannon Sands
Date: Tue, 3 Feb 2026 11:41:34 +1000
Subject: [PATCH] smokes working, fixing up toolserver. switched to llama.cpp,
 ollama sucks too much

---
 .env.example                               |   5 +-
 atropos/agent/atropos_agent.py             | 168 ++++++++++++-----
 atropos/envs/agent_env.py                  |  11 +-
 atropos/envs/hermes_compat_test_env.py     | 203 ++++++++------------
 atropos/envs/sandbox_terminal_smoke_env.py | 169 +++++++++++++++++
 atropos/envs/swe_smith_oracle_env.py       |   3 +-
 atropos/envs/test_env.py                   |   4 +-
 atropos/envs/toolserver_smoke_env.py       | 162 ++++++++++++++++
 atropos/slots/pool.py                      | 124 +++++++++----
 atropos/tools/base.py                      |  88 +++++++--
 hermes_agent.egg-info/PKG-INFO             |   2 +-
 hermes_agent.egg-info/SOURCES.txt          |  16 +-
 hermes_agent.egg-info/entry_points.txt     |   2 +
 hermes_agent.egg-info/requires.txt         |   2 +-
 pyproject.toml                             |   4 +-
 scripts/launch_llama_cpp_glm47_flash.sh    |  62 +++++++
 tests/test_tool_call_parsing.py            |  31 ++++
 uv.lock                                    |   4 +-
 18 files changed, 822 insertions(+), 238 deletions(-)
 create mode 100644 atropos/envs/sandbox_terminal_smoke_env.py
 create mode 100644 atropos/envs/toolserver_smoke_env.py
 create mode 100755 scripts/launch_llama_cpp_glm47_flash.sh
 create mode 100644 tests/test_tool_call_parsing.py

diff --git a/.env.example b/.env.example
index 4c13afec63..1f2cba1da5 100644
--- a/.env.example
+++ b/.env.example
@@ -22,12 +22,13 @@ HERMES_BACKEND=openai
 # of OpenRouter.
 #
 # Local server convenience (base URL without /v1):
-# ATROPOS_SERVER_BASE_URL=http://localhost:11434
+# llama.cpp example (see `Hermes-Agent/scripts/launch_llama_cpp_glm47_flash.sh`):
+# ATROPOS_SERVER_BASE_URL=http://127.0.0.1:8080
 # ATROPOS_SERVER_MODEL=glm-4.7-flash
 # ATROPOS_SERVER_API_KEY=local
 #
 # Generic OpenAI-compatible (base URL should include /v1):
-# OPENAI_BASE_URL=http://localhost:11434/v1
+# OPENAI_BASE_URL=http://127.0.0.1:8080/v1
 # OPENAI_API_KEY=local
 
 # =============================================================================
diff --git a/atropos/agent/atropos_agent.py b/atropos/agent/atropos_agent.py
index 9ea6e3044e..2fb1de6d68 100644
--- a/atropos/agent/atropos_agent.py
+++ b/atropos/agent/atropos_agent.py
@@ -15,6 +15,7 @@ The agent uses Hermes-style XML tags for tool calls:
 
 import asyncio
+import json
 import os
 from contextlib import asynccontextmanager
 from dataclasses import dataclass, field
 from typing import Any, AsyncGenerator, Awaitable, Callable, Dict, List, Optional, Union
@@ -27,33 +28,66 @@ from atroposlib.envs.server_handling.managed_server import ManagedServer
 
 load_dotenv()
 
-# Default system prompt with tool calling instructions
-AGENT_SYSTEM_PROMPT = """You are a helpful AI assistant with access to tools. You can use tools to accomplish tasks.
+# Default system prompt with tool calling instructions.
+#
+# IMPORTANT: In training-mode environments we want "raw text in -> raw text out" and we
+# parse tool calls from completion text. Do not rely on server-specific `tool_calls` fields.
+AGENT_SYSTEM_PROMPT = """You are a function-calling AI model.
 
-## Available Tools
+You are provided with function signatures within <tools></tools> XML tags.
+You may call one or more functions to assist with the user query. If available tools are not relevant,
+respond in natural language.
+
+After calling & executing a function, you will be provided with function results within
+<tool_response> </tool_response> XML tags.
+
+Here are the available tools:
+<tools>
 
-{tool_descriptions}
+{tools_json}
+</tools>
 
-## How to Use Tools
-To use a tool, output a tool call in the following format:
-<tool_call>{{"name": "tool_name", "arguments": {{"arg1": "value1", "arg2": "value2"}}}}</tool_call>
+## REQUIRED TOOL FORMAT
 
-You may reason about what to do before calling a tool:
-<think>I need to check what files are in the current directory...</think>
-<tool_call>{{"name": "bash", "arguments": {{"command": "ls -la"}}}}</tool_call>
+When you decide to call a tool, your assistant message MUST be:
+1) exactly one <think> ... </think> block, followed by
+2) one or more <tool_call> ... </tool_call> blocks,
+and NOTHING else in that message.
 
-After a tool is executed, you will receive the result:
-<tool_response>{{"success": true, "output": "..."}}</tool_response>
+For each tool call, output a JSON object with this schema:
+{"name": "function_name", "arguments": { ... }}
 
-Continue using tools as needed until you have completed the task.
-When you have finished, provide your final response without any tool calls.
+Each tool call MUST be enclosed within <tool_call> </tool_call> XML tags.
+The JSON inside <tool_call> MUST be valid JSON with double quotes.
 
-## Important Guidelines
-- Think step by step about what you need to do
-- Use tools to gather information and perform actions
-- If a tool call fails, analyze the error and try a different approach
-- Provide clear, concise responses when the task is complete
+Do NOT output <tool_response> in an assistant message.
+
+After you receive tool results, you may either call more tools (same required format) or provide the final answer.
+When providing the final answer, do NOT include any <tool_call> blocks.
+
+## ICL (examples)
+
+User: Show the current directory.
+Assistant: <think>
+I should use the terminal tool to print the current directory.
+</think>
+<tool_call>
+{"name": "terminal", "arguments": {"command": "pwd"}}
+</tool_call>
+User: <tool_response>{"success": true, "output": "/tmp\\n"}</tool_response>
+Assistant: /tmp
+
+User: List files, then count them.
+Assistant: <think>
+I should list files and count lines.
+</think>
+<tool_call>
+{"name": "terminal", "arguments": {"command": "ls -1 | wc -l"}}
+</tool_call>
+User: <tool_response>{"success": true, "output": "3\\n"}</tool_response>
+Assistant: 3
+
+User: Run pwd, then print ok.
+Assistant: <think>
+I should run pwd, then run a command that prints ok.
+</think>
+<tool_call>
+{"name": "terminal", "arguments": {"command": "pwd"}}
+</tool_call>
+<tool_call>
+{"name": "terminal", "arguments": {"command": "echo ok"}}
+</tool_call>
+User: <tool_response>{"success": true, "output": "/tmp\\n"}</tool_response>
+User: <tool_response>{"success": true, "output": "ok\\n"}</tool_response>
+Assistant: ok
 """
 
 
@@ -62,8 +96,9 @@ class AgentConfig:
     """Configuration for the AtroposAgent."""
 
     # Generation parameters
-    temperature: float = 0.7
-    max_tokens: int = 4096
+    temperature: Optional[float] = 0.7
+    # Default to "let the backend decide" (important for tool-tag completions that may be longer).
+    max_tokens: Optional[int] = None
 
     # Agent behavior
     max_steps: int = 50
@@ -222,13 +257,53 @@
         """Build the system prompt with tool descriptions."""
         if self.config.system_prompt:
             return self.config.system_prompt
-
-        tool_descriptions = self.tools.get_prompt_description()
-        if not tool_descriptions:
-            tool_descriptions = "(No tools available)"
-
-        return AGENT_SYSTEM_PROMPT.format(tool_descriptions=tool_descriptions)
-
+
+        tools_json = self.tools.get_prompt_tool_definitions_json()
+        # Avoid `str.format()` here because the prompt contains many literal `{}` braces
+        # in JSON examples; we only want to substitute the single `{tools_json}` token.
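+        # (With str.format(), the literal `{"name": ...}` JSON examples in the prompt
+        #  would be parsed as replacement fields and raise KeyError: '"name"'.)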
+ return AGENT_SYSTEM_PROMPT.replace("{tools_json}", tools_json) + + def _debug_dump_request(self, *, step_num: int, chat_kwargs: Dict[str, Any]) -> None: + if os.getenv("ATROPOS_DEBUG_AGENT_REQUEST") != "1": + return + try: + # Avoid dumping megabytes by default; messages can be huge. + meta = { + "step": step_num, + "chat_kwargs_keys": sorted(list(chat_kwargs.keys())), + "n": chat_kwargs.get("n"), + "max_tokens": chat_kwargs.get("max_tokens"), + "temperature": chat_kwargs.get("temperature"), + "num_messages": len(chat_kwargs.get("messages") or []), + } + print("\n=== ATROPOS_DEBUG_AGENT_REQUEST ===", flush=True) + print(meta, flush=True) + + if os.getenv("ATROPOS_DEBUG_AGENT_REQUEST_FULL") == "1": + payload = dict(chat_kwargs) + # Make the payload more legible and less huge. + try: + dumped = json.dumps(payload, ensure_ascii=False, indent=2) + except Exception: + dumped = repr(payload) + print("\n=== ATROPOS_DEBUG_AGENT_REQUEST_FULL ===", flush=True) + print(dumped[:200_000], flush=True) + except Exception: + return + + def _debug_dump_response(self, *, step_num: int, response: Any) -> None: + if os.getenv("ATROPOS_DEBUG_AGENT_RESPONSE") != "1": + return + print("\n=== ATROPOS_DEBUG_AGENT_RESPONSE ===", flush=True) + print({"step": step_num, "type": type(response).__name__}, flush=True) + try: + dumped = response.model_dump() # openai pydantic model + except Exception: + dumped = getattr(response, "__dict__", {"repr": repr(response)}) + # Keep the dump bounded; we only need enough to see the assistant message content. + text = str(dumped) + print(text[:200_000], flush=True) + async def run( self, task: str, @@ -265,12 +340,15 @@ class AtroposAgent: # Keep a copy of the prompt messages used for this completion. # Useful for reconstructing tokens/masks when state tracking is unavailable. prompt_messages = list(messages) - response = await managed.chat_completion( - messages=messages, - n=1, - max_tokens=self.config.max_tokens, - temperature=self.config.temperature, - ) + chat_kwargs: Dict[str, Any] = {"messages": messages, "n": 1} + if self.config.max_tokens is not None: + chat_kwargs["max_tokens"] = self.config.max_tokens + if self.config.temperature is not None: + chat_kwargs["temperature"] = self.config.temperature + + self._debug_dump_request(step_num=step_num + 1, chat_kwargs=chat_kwargs) + response = await managed.chat_completion(**chat_kwargs) + self._debug_dump_response(step_num=step_num + 1, response=response) current_node = None if hasattr(managed, "get_state"): @@ -286,7 +364,9 @@ class AtroposAgent: error=f"Generation error: {str(e)}", ) - response_text = response.choices[0].message.content or "" + msg = response.choices[0].message + # Some OpenAI-compatible servers populate `message.reasoning` and leave `content=""`. 
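+        # (e.g. a choice shaped like {"message": {"content": "", "reasoning": "..."}};
+        #  the shape is illustrative, not a spec.)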
+ response_text = (msg.content or "") or (getattr(msg, "reasoning", None) or "") tool_calls = ToolCall.parse_from_text(response_text) step = AgentStep( @@ -380,12 +460,15 @@ class AtroposAgent: Tuple of (response_text, tool_results, sequence_data) """ async with self._managed() as managed: - response = await managed.chat_completion( - messages=messages, - n=1, - max_tokens=self.config.max_tokens, - temperature=self.config.temperature, - ) + chat_kwargs: Dict[str, Any] = {"messages": messages, "n": 1} + if self.config.max_tokens is not None: + chat_kwargs["max_tokens"] = self.config.max_tokens + if self.config.temperature is not None: + chat_kwargs["temperature"] = self.config.temperature + + self._debug_dump_request(step_num=1, chat_kwargs=chat_kwargs) + response = await managed.chat_completion(**chat_kwargs) + self._debug_dump_response(step_num=1, response=response) current_node = None if hasattr(managed, "get_state"): @@ -393,7 +476,8 @@ class AtroposAgent: nodes = state.get("nodes", []) current_node = nodes[-1] if nodes else None - response_text = response.choices[0].message.content or "" + msg = response.choices[0].message + response_text = (msg.content or "") or (getattr(msg, "reasoning", None) or "") tool_results = [] if execute_tools: diff --git a/atropos/envs/agent_env.py b/atropos/envs/agent_env.py index 0e88a22237..789ea88b7c 100644 --- a/atropos/envs/agent_env.py +++ b/atropos/envs/agent_env.py @@ -18,7 +18,7 @@ from pydantic import Field from atroposlib.envs.base import APIServerConfig, BaseEnv, BaseEnvConfig, Item, ScoredDataGroup, ScoredDataItem -from ..agent import AgentConfig, AtroposAgent +from ..agent import AgentConfig, AgentResult, AtroposAgent from ..slots import SlotPool, SlotPoolConfig from ..tools import ToolRegistry, build_tool_registry from ..tools.tool_executor import ToolExecutor, ToolExecutorConfig @@ -56,7 +56,10 @@ class AgentEnvConfig(BaseEnvConfig): # basic agent defaults agent_max_steps: int = Field(default=50, description="Max ReACT steps per trajectory") agent_temperature: float = Field(default=0.7, description="Sampling temperature") - agent_max_tokens: int = Field(default=4096, description="Max tokens per model response") + agent_max_tokens: Optional[int] = Field( + default=None, + description="Max tokens per model response (default: let backend decide)", + ) agent_tool_delay_s: float = Field(default=0.0, description="Delay between tool calls (seconds)") # tool selection @@ -143,6 +146,7 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]): *, trajectory_id: str, exec_tool: Callable[["ToolCall"], Awaitable["ToolResult"]], + agent_result: Optional[AgentResult] = None, ) -> tuple[float, Dict[str, Any]]: """ Optional hook: run in-sandbox verification before scoring. @@ -152,7 +156,7 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]): Default: calls `score_trajectory()` and returns empty metadata. 
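+        Example (hypothetical override sketch; assumes `ToolCall(name=..., arguments=...)`
+        is constructible with defaults for its remaining fields):
+
+            async def verify_and_score_trajectory(self, item, final_response, *,
+                                                  trajectory_id, exec_tool, agent_result=None):
+                res = await exec_tool(ToolCall(name="terminal", arguments={"command": "pytest -q"}))
+                return (1.0 if res.success else 0.0), {"output": res.output}
+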
""" - _ = (trajectory_id, exec_tool) # default ignores in-workspace verification + _ = (trajectory_id, exec_tool, agent_result) # default ignores in-workspace verification score = await self.score_trajectory(item, final_response) return score, {} @@ -299,6 +303,7 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]): result.final_response, trajectory_id=trajectory_id, exec_tool=_exec, + agent_result=result, ) messages = [{"role": "system", "content": agent._build_system_prompt()}] # noqa: SLF001 diff --git a/atropos/envs/hermes_compat_test_env.py b/atropos/envs/hermes_compat_test_env.py index 270b507f83..93b0fe2dd9 100644 --- a/atropos/envs/hermes_compat_test_env.py +++ b/atropos/envs/hermes_compat_test_env.py @@ -1,70 +1,63 @@ """ -Hermes-Agent (Atropos-compatible) smoke environment. +Hermes-Agent + Atropos (Nomad sandbox) compatibility smoke environment. -This is a minimal `BaseEnv` environment that uses Hermes-Agent's Atropos-backed -runner (`AtroposAIAgent`) and can be exercised via `BaseEnv`'s `process` mode. +This environment is intended to validate, end-to-end: + BaseEnv.process -> AgentEnv -> ToolExecutor (batched) -> Nomad SlotPool -> sandbox_server -This deliberately does NOT use slot multiplexing / sandboxes yet (stage 1). +It forces the model to use a sandbox tool by asking it to run a command that +generates a high-entropy token inside the sandbox, then repeat it exactly. + +Run (process mode): + uv run python -m atropos.envs.hermes_compat_test_env process --env.use_wandb false --env.total_steps 2 --env.group_size 1 """ from __future__ import annotations -import json import os -import uuid -from typing import Dict, List, Tuple +from typing import Any, Dict, List, Tuple from dotenv import load_dotenv from pydantic import Field -from atroposlib.envs.base import APIServerConfig, BaseEnv, BaseEnvConfig, Item +from atroposlib.envs.base import APIServerConfig, Item + +from ..agent import AgentConfig, AgentResult +from ..tools import ToolCall +from .agent_env import AgentEnv, AgentEnvConfig load_dotenv() -def _build_forced_tool_item() -> Item: - """ - Construct a task that *cannot* be completed reliably without executing a tool. - - We generate a high-entropy token *inside the tool execution* and ask the agent to - repeat it exactly. Scoring verifies that: - - a terminal tool call occurred (role="tool" message present), and - - the final answer matches the tool stdout exactly. - """ +def _forced_tool_item() -> Item: + # Use double quotes in the shell command and show JSON escaping explicitly. + # This avoids invalid JSON escapes like `\\'` (not valid JSON) that some models produce. + cmd = 'python -c "import secrets; print(secrets.token_hex(16))"' return { - "command": "python -c \"import secrets; print(secrets.token_hex(16))\"", + "command": cmd, "prompt": ( - "Use the terminal tool to run:\n" - "python -c \"import secrets; print(secrets.token_hex(16))\"\n" - "Then answer with EXACTLY what it printed and nothing else." + "You are acting as an agent inside a sandboxed environment.\n" + "You MUST use the terminal tool to execute commands.\n" + "Run this exact command:\n" + f"{cmd}\n" + "When you call the tool, use valid JSON inside . Example:\n" + '{"name": "terminal", "arguments": {"command": ' + '"python -c \\\\"import secrets; print(secrets.token_hex(16))\\\\""}}' + "\n" + "Then respond with EXACTLY what it printed (the hex token) and nothing else.\n" + "Do not guess. Do not explain." 
), } -TEST_ITEMS: List[Item] = [ - _build_forced_tool_item(), - _build_forced_tool_item(), -] - - -class HermesCompatTestEnvConfig(BaseEnvConfig): - """Config for HermesCompatTestEnv.""" - +class HermesCompatTestEnvConfig(AgentEnvConfig): server_base_url: str = Field( - default="http://localhost:11434", + default="http://127.0.0.1:8080", description="Base URL for an OpenAI-compatible chat server (without /v1).", ) server_model: str = Field(default="glm-4.7-flash", description="Model name") -class HermesCompatTestEnv(BaseEnv): - """ - Minimal BaseEnv that runs Hermes-Agent's Atropos-compatible agent loop. - - Run (process mode): - uv run atropos-agent-hermes-compat-test process --env.use_wandb false --env.total_steps 2 --env.group_size 1 - """ - +class HermesCompatTestEnv(AgentEnv[HermesCompatTestEnvConfig]): name = "hermes_compat_test_env" env_config_cls = HermesCompatTestEnvConfig @@ -75,39 +68,22 @@ class HermesCompatTestEnv(BaseEnv): slurm: bool = False, testing: bool = False, ): - super().__init__(config=config, server_configs=server_configs, slurm=slurm, testing=testing) + super().__init__(config, server_configs, slurm, testing) self._iter = 0 - from atropos_compatible_agent import AtroposAIAgent # noqa: WPS433 - - # Only expose terminal for this smoke env. - self._agent = AtroposAIAgent( - server=self.server, - tokenizer=self.tokenizer, - model=getattr(config, "server_model", "local"), - max_iterations=8, - enabled_toolsets=["terminal"], - tool_delay=0.0, - # Let the server decide token limits; we care about tool calling correctness here. - max_tokens=None, - temperature=None, - ) - @classmethod def config_init(cls) -> Tuple[HermesCompatTestEnvConfig, List[APIServerConfig]]: base_url = ( os.getenv("ATROPOS_SERVER_BASE_URL") or os.getenv("OPENAI_BASE_URL") or os.getenv("LLM_BASE_URL") - or "http://localhost:11434" + or "http://127.0.0.1:8080" ) model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash" - # Never pass through real API keys in this smoke env (they will be printed by BaseEnv config logging). - # Local OpenAI-compatible servers typically ignore the API key anyway. - api_key = "local" + api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local" env_config = HermesCompatTestEnvConfig( - tokenizer_name="Qwen/Qwen2.5-1.5B-Instruct", + tokenizer_name="Qwen/Qwen2.5-1.5B-Instruct", # tokenization only group_size=1, use_wandb=False, include_messages=True, @@ -116,13 +92,18 @@ class HermesCompatTestEnv(BaseEnv): batch_size=1, server_base_url=base_url, server_model=model, + # Tooling: sandbox-only terminal. + enabled_toolsets=["terminal"], + disabled_toolsets=[], + # Default to Nomad sandboxing; users can override via --env.* args. + sandbox_image=os.getenv("ATROPOS_SANDBOX_IMAGE") or "atropos-sandbox:local", + purge_job_on_shutdown=True, ) server_configs = [ APIServerConfig( - server_type="openai", model_name=model, - base_url=f"{base_url}/v1", + base_url=f"{base_url.rstrip('/')}/v1", api_key=api_key, num_max_requests_at_once=1, num_requests_for_eval=1, @@ -131,77 +112,55 @@ class HermesCompatTestEnv(BaseEnv): ] return env_config, server_configs - async def setup(self): + async def setup_agent_env(self) -> None: return None async def get_next_item(self) -> Item: - # Regenerate token per task to avoid leakage across steps. 
- item = _build_forced_tool_item() self._iter += 1 - return item + return _forced_tool_item() - async def collect_trajectory(self, item: Item): - prompt = item.get("prompt", "") + def build_task(self, item: Item) -> str: + return str(item.get("prompt") or "") - result = await self._agent.run_conversation_async( - prompt, - task_id=str(uuid.uuid4()), + def build_agent_config(self, item: Item) -> AgentConfig: # noqa: ARG002 + # Avoid imposing max_tokens by default; tool-tag responses can be long for some models. + return AgentConfig( + max_steps=min(8, int(self.config.agent_max_steps)), + temperature=0.2, + max_tokens=None, ) - final = (result.get("final_response") or "").strip() + async def score_trajectory(self, item: Item, final_response: str) -> float: + # Scoring happens in verify_and_score_trajectory so we can inspect tool results. + _ = (item, final_response) + return 0.0 + + async def verify_and_score_trajectory( + self, + item: Item, + final_response: str, + *, + trajectory_id: str, # noqa: ARG002 + exec_tool, # noqa: ARG002 + agent_result: AgentResult | None = None, + ) -> tuple[float, Dict[str, Any]]: + if agent_result is None: + return 0.0, {"error": "Missing agent_result"} - # Verify the agent actually executed the tool by extracting stdout from the tool message. observed: str = "" - saw_tool = False - for msg in result.get("messages", []): - if msg.get("role") == "tool": - saw_tool = True - # Tool messages contain JSON strings from terminal tool. - try: - payload = json.loads(msg.get("content") or "{}") - out = (payload.get("output") or "").strip() - if out: - observed = out.splitlines()[-1].strip() - except Exception: - continue - # Pass if: - # - a tool call occurred, and - # - the final answer matches the observed stdout exactly. - score = 1.0 if saw_tool and observed and final == observed else 0.0 + tool_ok = False + for step in agent_result.steps: + for res in step.tool_results: + if not res.success: + return 0.0, {"error": res.error, "output": res.output} + out = (res.output or "").strip() + if out: + observed = out.splitlines()[-1].strip() + tool_ok = True - # Tokenization fallback: build tokens/masks from final prompt + completion. - # Note: this is sufficient for smoke testing; production training should - # use a backend that supports ManagedServer state tracking. - system_prompt = result.get("system_prompt") - messages: List[Dict[str, str]] = result.get("messages", []) - prompt_messages = messages[:-1] if messages and messages[-1].get("role") == "assistant" else messages - - if system_prompt: - prompt_messages = [{"role": "system", "content": system_prompt}] + prompt_messages - - if hasattr(self.tokenizer, "apply_chat_template"): - prompt_text = self.tokenizer.apply_chat_template( - prompt_messages, tokenize=False, add_generation_prompt=True - ) - prompt_tokens = self.tokenizer.encode(prompt_text, add_special_tokens=False) - else: - prompt_text = "\n".join([f"{m['role']}: {m['content']}" for m in prompt_messages]) - prompt_tokens = self.tokenizer.encode(prompt_text, add_special_tokens=True) - - output_tokens = self.tokenizer.encode(final, add_special_tokens=False) - - scored = { - "tokens": prompt_tokens + output_tokens, - "masks": ([-100] * len(prompt_tokens)) + output_tokens, - "scores": score, - "messages": prompt_messages + [{"role": "assistant", "content": final}], - } - - return scored, [] - - async def evaluate(self, *args, **kwargs): # noqa: ARG002 - # Minimal eval hook for BaseEnv abstract method. 
-        return {}
+        final = (final_response or "").strip()
+        score = 1.0 if tool_ok and agent_result.total_tool_calls > 0 and observed and final == observed else 0.0
+        return score, {"observed": observed, "tool_calls": agent_result.total_tool_calls, "command": item.get("command")}
 
 
 if __name__ == "__main__":
diff --git a/atropos/envs/sandbox_terminal_smoke_env.py b/atropos/envs/sandbox_terminal_smoke_env.py
new file mode 100644
index 0000000000..9c140a30b6
--- /dev/null
+++ b/atropos/envs/sandbox_terminal_smoke_env.py
@@ -0,0 +1,169 @@
+"""
+Nomad sandbox terminal smoke environment (training-oriented).
+
+Validates, end-to-end:
+    BaseEnv.process -> AgentEnv -> ToolExecutor (batched) -> Nomad SlotPool -> sandbox_server
+
+It forces the model to use a sandbox tool by asking it to run a command that
+generates a high-entropy token inside the sandbox, then repeat it exactly.
+
+Run (process mode):
+    uv run python -m atropos.envs.sandbox_terminal_smoke_env process --env.use_wandb false --env.total_steps 2 --env.group_size 1
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict, List, Tuple
+
+from dotenv import load_dotenv
+from pydantic import Field
+
+from atroposlib.envs.base import APIServerConfig, Item
+
+from ..agent import AgentConfig, AgentResult
+from ..tools import ToolCall
+from .agent_env import AgentEnv, AgentEnvConfig
+
+load_dotenv()
+
+# None means "use the agent's default Hermes-style system prompt".
+STRICT_TOOLCALL_SYSTEM_PROMPT = None
+
+
+def _forced_tool_item() -> Item:
+    # Use double quotes in the shell command and show JSON escaping explicitly.
+    # This avoids invalid JSON escapes like `\\'` (not valid JSON) that some models produce.
+    cmd = 'python -c "import secrets; print(secrets.token_hex(16))"'
+    return {
+        "command": cmd,
+        "prompt": (
+            "You MUST use the terminal tool.\n"
+            "Run this exact command:\n"
+            f"{cmd}\n"
+            "When you call the tool, use valid JSON inside <tool_call>. Example:\n"
+            '<tool_call>{"name": "terminal", "arguments": {"command": '
+            '"python -c \\"import secrets; print(secrets.token_hex(16))\\""}}'
+            "</tool_call>\n"
+            "Then respond with EXACTLY what it printed (the hex token) and nothing else.\n"
+            "Do not guess. Do not explain."
+ ), + } + + +class SandboxTerminalSmokeEnvConfig(AgentEnvConfig): + server_base_url: str = Field( + default="http://127.0.0.1:8080", + description="Base URL for an OpenAI-compatible chat server (without /v1).", + ) + server_model: str = Field(default="glm-4.7-flash", description="Model name") + + +class SandboxTerminalSmokeEnv(AgentEnv[SandboxTerminalSmokeEnvConfig]): + name = "sandbox_terminal_smoke_env" + env_config_cls = SandboxTerminalSmokeEnvConfig + + def __init__( + self, + config: SandboxTerminalSmokeEnvConfig, + server_configs: List[APIServerConfig], + slurm: bool = False, + testing: bool = False, + ): + super().__init__(config, server_configs, slurm, testing) + self._iter = 0 + + @classmethod + def config_init(cls) -> Tuple[SandboxTerminalSmokeEnvConfig, List[APIServerConfig]]: + base_url = ( + os.getenv("ATROPOS_SERVER_BASE_URL") + or os.getenv("OPENAI_BASE_URL") + or os.getenv("LLM_BASE_URL") + or "http://127.0.0.1:8080" + ) + model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash" + api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local" + + env_config = SandboxTerminalSmokeEnvConfig( + tokenizer_name="Qwen/Qwen2.5-1.5B-Instruct", # tokenization only + group_size=1, + use_wandb=False, + include_messages=True, + ensure_scores_are_not_same=False, + total_steps=2, + batch_size=1, + server_base_url=base_url, + server_model=model, + # Tooling: sandbox-only terminal. + enabled_toolsets=["terminal"], + disabled_toolsets=[], + # Default to Nomad sandboxing; users can override via --env.* args. + sandbox_image=os.getenv("ATROPOS_SANDBOX_IMAGE") or "atropos-sandbox:local", + purge_job_on_shutdown=True, + ) + + server_configs = [ + APIServerConfig( + model_name=model, + base_url=f"{base_url.rstrip('/')}/v1", + api_key=api_key, + num_max_requests_at_once=1, + num_requests_for_eval=1, + timeout=120, + ) + ] + return env_config, server_configs + + async def setup_agent_env(self) -> None: + return None + + async def get_next_item(self) -> Item: + self._iter += 1 + return _forced_tool_item() + + def build_task(self, item: Item) -> str: + return str(item.get("prompt") or "") + + def build_agent_config(self, item: Item) -> AgentConfig: # noqa: ARG002 + # Avoid imposing max_tokens by default; tool-tag responses can be long for some models. + return AgentConfig( + max_steps=min(8, int(self.config.agent_max_steps)), + temperature=0.2, + max_tokens=None, + system_prompt=STRICT_TOOLCALL_SYSTEM_PROMPT, + ) + + async def score_trajectory(self, item: Item, final_response: str) -> float: + # Scoring happens in verify_and_score_trajectory so we can inspect tool results. 
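+        # (AgentEnv scores with the value returned by verify_and_score_trajectory,
+        #  so this constant is never used for this env.)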
+ _ = (item, final_response) + return 0.0 + + async def verify_and_score_trajectory( + self, + item: Item, + final_response: str, + *, + trajectory_id: str, # noqa: ARG002 + exec_tool, # noqa: ARG002 + agent_result: AgentResult | None = None, + ) -> tuple[float, Dict[str, Any]]: + if agent_result is None: + return 0.0, {"error": "Missing agent_result"} + + observed: str = "" + tool_ok = False + for step in agent_result.steps: + for res in step.tool_results: + if not res.success: + return 0.0, {"error": res.error, "output": res.output} + out = (res.output or "").strip() + if out: + observed = out.splitlines()[-1].strip() + tool_ok = True + + final = (final_response or "").strip() + score = 1.0 if tool_ok and agent_result.total_tool_calls > 0 and observed and final == observed else 0.0 + return score, {"observed": observed, "tool_calls": agent_result.total_tool_calls, "command": item.get("command")} + + +if __name__ == "__main__": + SandboxTerminalSmokeEnv.cli() diff --git a/atropos/envs/swe_smith_oracle_env.py b/atropos/envs/swe_smith_oracle_env.py index 756284ed2e..79f384495c 100644 --- a/atropos/envs/swe_smith_oracle_env.py +++ b/atropos/envs/swe_smith_oracle_env.py @@ -72,7 +72,7 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]): os.getenv("ATROPOS_SERVER_BASE_URL") or os.getenv("OPENAI_BASE_URL") or os.getenv("LLM_BASE_URL") - or "http://localhost:11434" + or "http://127.0.0.1:8080" ) model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash" api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local" @@ -252,6 +252,7 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]): *, trajectory_id: str, exec_tool, + agent_result=None, # noqa: ARG002 ) -> tuple[float, Dict[str, Any]]: _ = trajectory_id repo_dir = self._repo_name(item) diff --git a/atropos/envs/test_env.py b/atropos/envs/test_env.py index 8ae6a2f27c..7c7f08e011 100644 --- a/atropos/envs/test_env.py +++ b/atropos/envs/test_env.py @@ -61,7 +61,7 @@ class SimpleTestEnvConfig(AgentEnvConfig): """Configuration for the simple test environment.""" server_base_url: str = Field( - default="http://localhost:11434", + default="http://127.0.0.1:8080", description="Base URL for an OpenAI-compatible server (without /v1)", ) server_model: str = Field( @@ -102,7 +102,7 @@ class SimpleTestEnv(AgentEnv[SimpleTestEnvConfig]): os.getenv("ATROPOS_SERVER_BASE_URL") or os.getenv("OPENAI_BASE_URL") or os.getenv("LLM_BASE_URL") - or "http://localhost:11434" + or "http://127.0.0.1:8080" ) model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash" api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local" diff --git a/atropos/envs/toolserver_smoke_env.py b/atropos/envs/toolserver_smoke_env.py new file mode 100644 index 0000000000..4b39af468d --- /dev/null +++ b/atropos/envs/toolserver_smoke_env.py @@ -0,0 +1,162 @@ +""" +ToolServer routing smoke environment. + +Validates that: + - sandbox tools run through Nomad SlotPool (terminal -> bash in sandbox) + - external tools run through ToolServer (skills_list) + +This env uses ToolServer in-process by default (`tool_server_url="inprocess"`), +so it is self-contained for local testing. 
+
+Run:
+    uv run python -m atropos.envs.toolserver_smoke_env process --env.use_wandb false --env.total_steps 1 --env.group_size 1
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict, List, Tuple
+
+from dotenv import load_dotenv
+from pydantic import Field
+
+from atroposlib.envs.base import APIServerConfig, Item
+
+from ..agent import AgentConfig, AgentResult
+from .agent_env import AgentEnv, AgentEnvConfig
+
+load_dotenv()
+
+
+class ToolServerSmokeEnvConfig(AgentEnvConfig):
+    server_base_url: str = Field(
+        default="http://127.0.0.1:8080",
+        description="Base URL for an OpenAI-compatible chat server (without /v1).",
+    )
+    server_model: str = Field(default="glm-4.7-flash", description="Model name")
+
+
+class ToolServerSmokeEnv(AgentEnv[ToolServerSmokeEnvConfig]):
+    name = "toolserver_smoke_env"
+    env_config_cls = ToolServerSmokeEnvConfig
+
+    def __init__(
+        self,
+        config: ToolServerSmokeEnvConfig,
+        server_configs: List[APIServerConfig],
+        slurm: bool = False,
+        testing: bool = False,
+    ):
+        super().__init__(config, server_configs, slurm, testing)
+        self._iter = 0
+
+    @classmethod
+    def config_init(cls) -> Tuple[ToolServerSmokeEnvConfig, List[APIServerConfig]]:
+        base_url = (
+            os.getenv("ATROPOS_SERVER_BASE_URL")
+            or os.getenv("OPENAI_BASE_URL")
+            or os.getenv("LLM_BASE_URL")
+            or "http://127.0.0.1:8080"
+        )
+        model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash"
+        api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local"
+
+        env_config = ToolServerSmokeEnvConfig(
+            tokenizer_name="Qwen/Qwen2.5-1.5B-Instruct",  # tokenization only
+            group_size=1,
+            use_wandb=False,
+            include_messages=True,
+            ensure_scores_are_not_same=False,
+            total_steps=1,
+            batch_size=1,
+            server_base_url=base_url,
+            server_model=model,
+            enabled_toolsets=["terminal", "skills"],
+            disabled_toolsets=[],
+            # Self-contained ToolServer for local smoke.
+            tool_server_url="inprocess",
+            sandbox_image=os.getenv("ATROPOS_SANDBOX_IMAGE") or "atropos-sandbox:local",
+            purge_job_on_shutdown=True,
+        )
+
+        server_configs = [
+            APIServerConfig(
+                model_name=model,
+                base_url=f"{base_url.rstrip('/')}/v1",
+                api_key=api_key,
+                num_max_requests_at_once=1,
+                num_requests_for_eval=1,
+                timeout=120,
+            )
+        ]
+        return env_config, server_configs
+
+    async def setup_agent_env(self) -> None:
+        return None
+
+    async def get_next_item(self) -> Item:
+        self._iter += 1
+        return {
+            "prompt": (
+                "You MUST call exactly one tool per assistant message.\n"
+                "\n"
+                "Step 1) Call the skills_list tool (no arguments), then stop.\n"
+                "Step 2) After you receive the tool response, call the terminal tool to run:\n"
+                "python -c \"print('ok')\"\n"
+                "Step 3) After you receive the terminal tool response, answer with just: ok\n"
+                "\n"
+                "Tool call format requirements:\n"
+                "- Every tool call MUST be a complete <tool_call> XML block with a closing tag.\n"
+                "- Do NOT emit a second <tool_call> in the same assistant message.\n"
+                "\n"
+                "Example:\n"
+                "<tool_call>{\"name\": \"skills_list\", \"arguments\": {}}</tool_call>\n"
+                "Do not include anything else in your final answer."
+            )
+        }
+
+    def build_task(self, item: Item) -> str:
+        return str(item.get("prompt") or "")
+
+    def build_agent_config(self, item: Item) -> AgentConfig:  # noqa: ARG002
+        return AgentConfig(
+            max_steps=min(10, int(self.config.agent_max_steps)),
+            temperature=0.2,
+            max_tokens=None,
+        )
+
+    async def score_trajectory(self, item: Item, final_response: str) -> float:
+        _ = (item, final_response)
+        return 0.0
+
+    async def verify_and_score_trajectory(
+        self,
+        item: Item,
+        final_response: str,
+        *,
+        trajectory_id: str,  # noqa: ARG002
+        exec_tool,  # noqa: ARG002
+        agent_result: AgentResult | None = None,
+    ) -> tuple[float, Dict[str, Any]]:
+        if agent_result is None:
+            return 0.0, {"error": "Missing agent_result"}
+
+        called = {c.name for s in agent_result.steps for c in s.tool_calls}
+        need = {"skills_list", "terminal"}
+        if not need.issubset(called):
+            return 0.0, {"error": f"Missing tool calls: {sorted(need - called)}", "called": sorted(called)}
+
+        terminal_ok = False
+        for step in agent_result.steps:
+            for call, res in zip(step.tool_calls, step.tool_results):
+                if call.name != "terminal":
+                    continue
+                # Guard against empty output: `"".splitlines()` is `[]`, so indexing
+                # `[-1]` directly would raise IndexError on an empty tool result.
+                lines = (res.output or "").strip().splitlines()
+                if res.success and lines and lines[-1].strip() == "ok":
+                    terminal_ok = True
+
+        score = 1.0 if terminal_ok and (final_response or "").strip() == "ok" else 0.0
+        return score, {"called": sorted(called), "final": (final_response or "").strip()}
+
+
+if __name__ == "__main__":
+    ToolServerSmokeEnv.cli()
diff --git a/atropos/slots/pool.py b/atropos/slots/pool.py
index ba7eb683b3..d6ace16b7d 100644
--- a/atropos/slots/pool.py
+++ b/atropos/slots/pool.py
@@ -138,42 +138,47 @@ class SlotPool:
             return
 
         logger.info(f"Starting SlotPool (job_id={self.config.job_id})")
-
-        # Check Nomad health
-        if not await self.nomad.is_healthy():
-            raise RuntimeError(f"Nomad is not reachable at {self.config.nomad_address}")
-
-        # Check if job exists
-        job = await self.nomad.get_job(self.config.job_id)
-
-        if job is None:
-            # Deploy new job
-            logger.info(f"Deploying sandbox job: {self.config.job_id}")
-            job_spec = create_sandbox_job(
-                job_id=self.config.job_id,
-                image=self.config.image,
-                count=self.config.min_containers,
-                slots_per_container=self.config.slots_per_container,
-                privileged=self.config.privileged,
-                cpu=self.config.cpu,
-                memory=self.config.memory,
-                datacenter=self.config.datacenter,
-            )
-            result = await self.nomad.submit_job(job_spec)
-            if "error" in result:
-                raise RuntimeError(f"Failed to submit job: {result}")
-
-        # Wait for allocations to be running
+
+        try:
+            # Check Nomad health
+            if not await self.nomad.is_healthy():
+                raise RuntimeError(f"Nomad is not reachable at {self.config.nomad_address}")
+
+            # Check if job exists
+            job = await self.nomad.get_job(self.config.job_id)
+
+            if job is None:
+                # Deploy new job
+                logger.info(f"Deploying sandbox job: {self.config.job_id}")
+                job_spec = create_sandbox_job(
+                    job_id=self.config.job_id,
+                    image=self.config.image,
+                    count=self.config.min_containers,
+                    slots_per_container=self.config.slots_per_container,
+                    privileged=self.config.privileged,
+                    cpu=self.config.cpu,
+                    memory=self.config.memory,
+                    datacenter=self.config.datacenter,
+                )
+                result = await self.nomad.submit_job(job_spec)
+                if "error" in result:
+                    raise RuntimeError(f"Failed to submit job: {result}")
+
+            # Wait for allocations to be running (even if the job already existed).
-        await self._wait_for_healthy_allocations(self.config.min_containers)
+            await self._wait_for_healthy_allocations(self.config.min_containers)
-
-        # Discover existing allocations and slots
-        await self._refresh_slots()
-
-        # Start health check task
-        self._health_task = asyncio.create_task(self._health_check_loop())
-
-        self._started = True
-        logger.info(f"SlotPool started: {self.total_slots} slots available")
+
+            # Discover existing allocations and slots
+            await self._refresh_slots()
+
+            # Start health check task
+            self._health_task = asyncio.create_task(self._health_check_loop())
+
+            self._started = True
+            logger.info(f"SlotPool started: {self.total_slots} slots available")
+        except Exception:
+            # Ensure aiohttp sessions are not leaked if we fail to start.
+            await self.stop(purge_job=False)
+            raise
 
     async def stop(self, purge_job: bool = False) -> None:
         """
@@ -384,6 +389,19 @@ class SlotPool:
         """Wait for allocations to become healthy."""
         import time
         start = time.time()
+
+        def _summarize_alloc_detail(detail: Dict[str, Any]) -> str:
+            task_states = detail.get("TaskStates") or {}
+            parts: List[str] = []
+            if isinstance(task_states, dict):
+                for task_name, st in task_states.items():
+                    events = (st or {}).get("Events") or []
+                    if isinstance(events, list) and events:
+                        last = events[-1]
+                        desc = last.get("DisplayMessage") or last.get("Message") or last.get("Type") or ""
+                        if desc:
+                            parts.append(f"{task_name}: {desc}")
+            return "; ".join(parts)
 
         while time.time() - start < timeout:
             allocs = await self.nomad.get_job_allocations(self.config.job_id)
@@ -393,13 +411,45 @@
                 if alloc.status == AllocationStatus.RUNNING and alloc.http_address:
                     if await self.executor.health_check(alloc.http_address):
                         healthy_count += 1
+
+                # Fast-fail on obvious driver/image errors to avoid waiting out the full timeout.
+                if alloc.id:
+                    detail = await self.nomad.get_allocation(alloc.id)
+                    if isinstance(detail, dict):
+                        summary = _summarize_alloc_detail(detail)
+                        lowered = summary.lower()
+                        if "failed to pull" in lowered or "pull access denied" in lowered:
+                            raise RuntimeError(
+                                "Nomad allocation failed to start due to a Docker image pull error. "
+                                f"Allocation {alloc.id[:8]}: {summary}\n"
+                                "If you're using a local image tag (e.g. `atropos-sandbox:local`) on macOS, "
+                                "make sure the image is loaded into Docker (build with `docker buildx build --load ...`)."
+                            )
 
             if healthy_count >= min_count:
                 return
 
             await asyncio.sleep(2.0)
-
-        raise RuntimeError(f"Timed out waiting for {min_count} healthy allocations")
+
+        # Timed out: include allocation status detail to help debugging.
+        allocs = await self.nomad.get_job_allocations(self.config.job_id)
+        alloc_lines: List[str] = []
+        for alloc in allocs[:10]:
+            addr = alloc.http_address or "-"
+            line = f"{alloc.id[:8]} status={alloc.status.value} http={addr}"
+            detail = await self.nomad.get_allocation(alloc.id)
+            if isinstance(detail, dict):
+                summary = _summarize_alloc_detail(detail)
+                if summary:
+                    line += f" detail={summary}"
+            alloc_lines.append(line)
+
+        hint = (
+            "Timed out waiting for healthy sandbox allocations.\n"
+            f"Job: {self.config.job_id}, desired_healthy: {min_count}\n"
+            "Allocations:\n - " + "\n - ".join(alloc_lines)
+        )
+        raise RuntimeError(hint)
 
     async def _try_scale_up(self) -> bool:
         """Attempt to scale up the job."""
diff --git a/atropos/tools/base.py b/atropos/tools/base.py
index 33c9d1017a..a27b8f1ad1 100644
--- a/atropos/tools/base.py
+++ b/atropos/tools/base.py
@@ -72,26 +72,65 @@ class ToolCall:
         """
         Extract tool calls from text using Hermes-style XML tags.
 
-        Format: <tool_call>{"name": "...", "arguments": {...}}</tool_call>
+        Supported formats (STRICT: requires well-formed closing tags):
+        - Hermes JSON wrapper:
+          <tool_call>{"name": "...", "arguments": {...}}</tool_call>
+        - GLM/llama.cpp style:
+          <tool_call>terminal {"command": "ls -la"}</tool_call>
         """
-        calls = []
+        calls: List["ToolCall"] = []
+
+        if not text:
+            return calls
+
+        def _append_from_payload(*, name: str, arguments: Dict[str, Any], raw: str, uniq_id: Optional[str] = None) -> None:
+            if not isinstance(name, str) or not name:
+                return
+            if not isinstance(arguments, dict):
+                return
+            calls.append(
+                cls(
+                    name=name,
+                    arguments=arguments,
+                    raw_text=raw,
+                    uniq_id=uniq_id or str(uuid.uuid4()),
+                )
+            )
 
+        # STRICT parsing: only accept well-formed <tool_call> ... </tool_call> blocks.
         pattern = r"<tool_call>\s*(.*?)\s*</tool_call>"
-        matches = re.findall(pattern, text, re.DOTALL)
-
-        for match in matches:
-            try:
-                data = json.loads(match)
-                uniq_id = data.get("uniq_id") or data.get("id") or str(uuid.uuid4())
-                calls.append(cls(
+        for inner in re.findall(pattern, text, re.DOTALL):
+            cleaned = (inner or "").strip()
+            if not cleaned:
+                continue
+
+            # Hermes JSON wrapper.
+            if cleaned.startswith("{"):
+                try:
+                    data = json.loads(cleaned)
+                except json.JSONDecodeError:
+                    continue
+                uniq_id = data.get("uniq_id") or data.get("id") or None
+                _append_from_payload(
                     name=data.get("name", ""),
                     arguments=data.get("arguments", {}),
-                    raw_text=match,
+                    raw=inner,
                     uniq_id=uniq_id,
-                ))
-            except json.JSONDecodeError:
-                # Skip malformed tool calls
+                )
                 continue
-
+
+            # GLM/llama.cpp style: <tool_call>terminal {...}</tool_call>
+            m = re.match(r"^\s*([A-Za-z0-9_.:-]+)\s*(\{.*\})\s*$", cleaned, re.DOTALL)
+            if not m:
+                continue
+            name = m.group(1)
+            args_text = m.group(2)
+            try:
+                args = json.loads(args_text)
+            except json.JSONDecodeError:
+                continue
+            _append_from_payload(name=name, arguments=args, raw=inner)
+
         return calls
 
     @classmethod
@@ -208,6 +247,27 @@ class ToolRegistry:
         """Generate tool descriptions for system prompt."""
         descriptions = [tool.schema.to_prompt_description() for tool in self._tools.values()]
         return "\n\n".join(descriptions)
+
+    def get_prompt_tool_definitions_json(self) -> str:
+        """
+        Return a Hermes-style JSON list of tool definitions for use inside a `<tools>...</tools>` block.
+
+        Hermes trajectories historically use a simplified schema list:
+        [{"name": ..., "description": ..., "parameters": {...}, "required": null}, ...]
+        """
+        formatted: List[Dict[str, Any]] = []
+        for tool in self._tools.values():
+            fn = tool.schema.to_dict().get("function", {})
+            formatted.append(
+                {
+                    "name": fn.get("name", tool.name),
+                    "description": fn.get("description", ""),
+                    "parameters": fn.get("parameters", {}),
+                    # Keep parity with Hermes saved trajectories (required is typically null there).
+ "required": None, + } + ) + return json.dumps(formatted, ensure_ascii=False) async def execute(self, call: ToolCall) -> ToolResult: """Execute a tool call.""" diff --git a/hermes_agent.egg-info/PKG-INFO b/hermes_agent.egg-info/PKG-INFO index 98fb7f6d68..d17e541623 100644 --- a/hermes_agent.egg-info/PKG-INFO +++ b/hermes_agent.egg-info/PKG-INFO @@ -29,7 +29,7 @@ Provides-Extra: dev Requires-Dist: pytest; extra == "dev" Requires-Dist: pytest-asyncio; extra == "dev" Provides-Extra: atropos -Requires-Dist: atroposlib @ git+ssh://git@github.com/NousResearch/atropos.git ; extra == "atropos" +Requires-Dist: atroposlib @ git+https://github.com/NousResearch/atropos.git ; extra == "atropos" Requires-Dist: aiohttp; extra == "atropos" Requires-Dist: fastapi; extra == "atropos" Requires-Dist: uvicorn; extra == "atropos" diff --git a/hermes_agent.egg-info/SOURCES.txt b/hermes_agent.egg-info/SOURCES.txt index 3464034f41..87e0cd7d93 100644 --- a/hermes_agent.egg-info/SOURCES.txt +++ b/hermes_agent.egg-info/SOURCES.txt @@ -17,9 +17,10 @@ atropos/api/tool_executor_server.py atropos/api/tool_server.py atropos/envs/__init__.py atropos/envs/agent_env.py -atropos/envs/hermes_compat_test_env.py +atropos/envs/sandbox_terminal_smoke_env.py atropos/envs/swe_smith_oracle_env.py atropos/envs/test_env.py +atropos/envs/toolserver_smoke_env.py atropos/nomad/__init__.py atropos/nomad/client.py atropos/slots/__init__.py @@ -30,18 +31,13 @@ atropos/terminal/__init__.py atropos/terminal/asciinema_stream.py atropos/tools/__init__.py atropos/tools/base.py -atropos/tools/basic_tools.py -atropos/tools/image_generation_tool.py -atropos/tools/mixture_of_agents_tool.py -atropos/tools/terminal_hecate.py +atropos/tools/build_registry.py +atropos/tools/hermes_external_tools.py +atropos/tools/sandbox_stubs.py atropos/tools/terminal_stateful_tool.py -atropos/tools/terminal_tool.py atropos/tools/tmux_tool.py atropos/tools/tool_executor.py -atropos/tools/toolset_distributions.py -atropos/tools/toolsets.py -atropos/tools/vision_tools.py -atropos/tools/web_tools.py +atropos/tools/toolset_resolver.py hermes_agent.egg-info/PKG-INFO hermes_agent.egg-info/SOURCES.txt hermes_agent.egg-info/dependency_links.txt diff --git a/hermes_agent.egg-info/entry_points.txt b/hermes_agent.egg-info/entry_points.txt index 2e72c8e210..42fd7548f8 100644 --- a/hermes_agent.egg-info/entry_points.txt +++ b/hermes_agent.egg-info/entry_points.txt @@ -1,2 +1,4 @@ [console_scripts] hermes-agent = run_agent:main +hermes-atropos-sandbox-smoke = atropos.envs.sandbox_terminal_smoke_env:SandboxTerminalSmokeEnv.cli +hermes-atropos-toolserver-smoke = atropos.envs.toolserver_smoke_env:ToolServerSmokeEnv.cli diff --git a/hermes_agent.egg-info/requires.txt b/hermes_agent.egg-info/requires.txt index 0ef7437426..6e9ad30e5a 100644 --- a/hermes_agent.egg-info/requires.txt +++ b/hermes_agent.egg-info/requires.txt @@ -16,7 +16,7 @@ typer platformdirs [atropos] -atroposlib @ git+ssh://git@github.com/NousResearch/atropos.git +atroposlib @ git+https://github.com/NousResearch/atropos.git aiohttp fastapi uvicorn diff --git a/pyproject.toml b/pyproject.toml index aa78046db3..328a16a0fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ modal = ["modal", "boto3"] dev = ["pytest", "pytest-asyncio"] # Install Atropos from source (PyPI is often stale for this internal dependency). 
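+# (The switch from git+ssh to git+https below lets `uv`/`pip` fetch atroposlib
+# without a configured GitHub SSH key.)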
 atropos = [
-    "atroposlib @ git+ssh://git@github.com/NousResearch/atropos.git",
+    "atroposlib @ git+https://github.com/NousResearch/atropos.git",
     # Atropos integration runtime deps (kept optional for Hermes-only users)
     "aiohttp",
     "fastapi",
@@ -47,6 +47,8 @@ atropos = [
 
 [project.scripts]
 hermes-agent = "run_agent:main"
+hermes-atropos-sandbox-smoke = "atropos.envs.sandbox_terminal_smoke_env:SandboxTerminalSmokeEnv.cli"
+hermes-atropos-toolserver-smoke = "atropos.envs.toolserver_smoke_env:ToolServerSmokeEnv.cli"
 
 [tool.setuptools]
 py-modules = [
diff --git a/scripts/launch_llama_cpp_glm47_flash.sh b/scripts/launch_llama_cpp_glm47_flash.sh
new file mode 100755
index 0000000000..3f3716a2fa
--- /dev/null
+++ b/scripts/launch_llama_cpp_glm47_flash.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Launch a local llama.cpp OpenAI-compatible server running GLM-4.7-Flash (GGUF).
+#
+# Requires:
+#   - `llama-server` installed (e.g. `brew install llama.cpp`)
+#
+# The default port is 8080. If another local service (e.g. the Atropos
+# sandbox_server in local dev) is already listening there, override
+# LLAMA_CPP_PORT; the script refuses to start when the port is in use.
+#
+# Usage:
+#   Hermes-Agent/scripts/launch_llama_cpp_glm47_flash.sh
+#
+# Override defaults:
+#   LLAMA_CPP_HOST=127.0.0.1 LLAMA_CPP_PORT=8082 \
+#   LLAMA_CPP_HF_REPO=ggml-org/GLM-4.7-Flash-GGUF \
+#   LLAMA_CPP_HF_FILE=GLM-4.7-Flash-Q4_K.gguf \
+#   Hermes-Agent/scripts/launch_llama_cpp_glm47_flash.sh
+
+HOST="${LLAMA_CPP_HOST:-127.0.0.1}"
+PORT="${LLAMA_CPP_PORT:-8080}"
+HF_REPO="${LLAMA_CPP_HF_REPO:-ggml-org/GLM-4.7-Flash-GGUF}"
+HF_FILE="${LLAMA_CPP_HF_FILE:-GLM-4.7-Flash-Q4_K.gguf}"
+ALIAS="${LLAMA_CPP_ALIAS:-glm-4.7-flash}"
+
+if ! command -v llama-server >/dev/null 2>&1; then
+    echo "Error: llama-server not found in PATH."
+    echo "Install via Homebrew: brew install llama.cpp"
+    exit 1
+fi
+
+echo "Launching llama.cpp server..."
+echo "  host:  $HOST"
+echo "  port:  $PORT"
+echo "  repo:  $HF_REPO"
+echo "  file:  $HF_FILE"
+echo "  alias: $ALIAS"
+echo
+echo "Suggested env vars for Hermes/Atropos integration:"
+echo "  export ATROPOS_SERVER_BASE_URL=http://${HOST}:${PORT}"
+echo "  export ATROPOS_SERVER_MODEL=${ALIAS}"
+echo "  export ATROPOS_SERVER_API_KEY=local"
+echo
+
+if command -v lsof >/dev/null 2>&1; then
+    if lsof -nP -iTCP:"$PORT" -sTCP:LISTEN >/dev/null 2>&1; then
+        echo "Error: port $PORT is already in use."
+        echo "Pick a different port, e.g.:"
+        echo "  LLAMA_CPP_PORT=8082 Hermes-Agent/scripts/launch_llama_cpp_glm47_flash.sh"
+        exit 1
+    fi
+fi
+
+exec llama-server \
+    --host "$HOST" \
+    --port "$PORT" \
+    --hf-repo "$HF_REPO" \
+    --hf-file "$HF_FILE" \
+    --alias "$ALIAS" \
+    -c 32768 \
+    -n -1
diff --git a/tests/test_tool_call_parsing.py b/tests/test_tool_call_parsing.py
new file mode 100644
index 0000000000..49db864b24
--- /dev/null
+++ b/tests/test_tool_call_parsing.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+from atropos.tools.base import ToolCall
+
+
+def test_parse_tool_call_json_wrapper() -> None:
+    text = '<tool_call>{"name":"terminal","arguments":{"command":"pwd"}}</tool_call>'
+    calls = ToolCall.parse_from_text(text)
+    assert len(calls) == 1
+    assert calls[0].name == "terminal"
+    assert calls[0].arguments == {"command": "pwd"}
+
+
+def test_parse_tool_call_glm_style() -> None:
+    text = '<tool_call>terminal\n{"command":"ls -la"}</tool_call>'
+    calls = ToolCall.parse_from_text(text)
+    assert len(calls) == 1
+    assert calls[0].name == "terminal"
+    assert calls[0].arguments == {"command": "ls -la"}
+
+
+def test_parse_tool_call_missing_close_tag() -> None:
+    text = '<tool_call>terminal\n{"command":"echo hi"}'
+    calls = ToolCall.parse_from_text(text)
+    assert calls == []
+
+
+def test_parse_tool_call_strips_accidental_xml() -> None:
+    text = '<tool_call>\n<tool_call>terminal\n{"command":"ls -la"}</tool_call>'
+    calls = ToolCall.parse_from_text(text)
+    assert calls == []
diff --git a/uv.lock b/uv.lock
index 52eaaccbe0..0cb6f730fc 100644
--- a/uv.lock
+++ b/uv.lock
@@ -218,7 +218,7 @@ wheels = [
 [[package]]
 name = "atroposlib"
 version = "0.3.0"
-source = { git = "ssh://git@github.com/NousResearch/atropos.git#462abbebf75f44e811116c3730ce9874c4358a80" }
+source = { git = "https://github.com/NousResearch/atropos.git#462abbebf75f44e811116c3730ce9874c4358a80" }
 dependencies = [
     { name = "aiofiles" },
     { name = "aiohttp" },
@@ -913,7 +913,7 @@ modal = [
 [package.metadata]
 requires-dist = [
     { name = "aiohttp", marker = "extra == 'atropos'" },
-    { name = "atroposlib", marker = "extra == 'atropos'", git = "ssh://git@github.com/NousResearch/atropos.git" },
+    { name = "atroposlib", marker = "extra == 'atropos'", git = "https://github.com/NousResearch/atropos.git" },
     { name = "boto3", marker = "extra == 'modal'" },
     { name = "fal-client" },
     { name = "fastapi", marker = "extra == 'atropos'" },