smokes working, fixing up toolserver. switched to llama.cpp, ollama sucks too much

2026-05-16 04:22:36 +00:00 · 2026-02-03 11:41:34 +10:00 · 2026-02-03 11:41:34 +10:00 · 16fb41f9cc
commit 16fb41f9cc
parent 4939130485
18 changed files with 822 additions and 238 deletions
--- a/atropos/envs/sandbox_terminal_smoke_env.py
+++ b/atropos/envs/sandbox_terminal_smoke_env.py
@ -0,0 +1,169 @@
+"""
+Nomad sandbox terminal smoke environment (training-oriented).
+
+Validates, end-to-end:
+  BaseEnv.process -> AgentEnv -> ToolExecutor (batched) -> Nomad SlotPool -> sandbox_server
+
+It forces the model to use a sandbox tool by asking it to run a command that
+generates a high-entropy token inside the sandbox, then repeat it exactly.
+
+Run (process mode):
+  uv run python -m atropos.envs.sandbox_terminal_smoke_env process --env.use_wandb false --env.total_steps 2 --env.group_size 1
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict, List, Tuple
+
+from dotenv import load_dotenv
+from pydantic import Field
+
+from atroposlib.envs.base import APIServerConfig, Item
+
+from ..agent import AgentConfig, AgentResult
+from ..tools import ToolCall
+from .agent_env import AgentEnv, AgentEnvConfig
+
+load_dotenv()
+
+STRICT_TOOLCALL_SYSTEM_PROMPT = None
+
+
+def _forced_tool_item() -> Item:
+    # Use double quotes in the shell command and show JSON escaping explicitly.
+    # This avoids invalid JSON escapes like `\\'` (not valid JSON) that some models produce.
+    cmd = 'python -c "import secrets; print(secrets.token_hex(16))"'
+    return {
+        "command": cmd,
+        "prompt": (
+            "You MUST use the terminal tool.\n"
+            "Run this exact command:\n"
+            f"{cmd}\n"
+            "When you call the tool, use valid JSON inside <tool_call>. Example:\n"
+            '<tool_call>{"name": "terminal", "arguments": {"command": '
+            '"python -c \\\\"import secrets; print(secrets.token_hex(16))\\\\""}}'
+            "</tool_call>\n"
+            "Then respond with EXACTLY what it printed (the hex token) and nothing else.\n"
+            "Do not guess. Do not explain."
+        ),
+    }
+
+
+class SandboxTerminalSmokeEnvConfig(AgentEnvConfig):
+    server_base_url: str = Field(
+        default="http://127.0.0.1:8080",
+        description="Base URL for an OpenAI-compatible chat server (without /v1).",
+    )
+    server_model: str = Field(default="glm-4.7-flash", description="Model name")
+
+
+class SandboxTerminalSmokeEnv(AgentEnv[SandboxTerminalSmokeEnvConfig]):
+    name = "sandbox_terminal_smoke_env"
+    env_config_cls = SandboxTerminalSmokeEnvConfig
+
+    def __init__(
+        self,
+        config: SandboxTerminalSmokeEnvConfig,
+        server_configs: List[APIServerConfig],
+        slurm: bool = False,
+        testing: bool = False,
+    ):
+        super().__init__(config, server_configs, slurm, testing)
+        self._iter = 0
+
+    @classmethod
+    def config_init(cls) -> Tuple[SandboxTerminalSmokeEnvConfig, List[APIServerConfig]]:
+        base_url = (
+            os.getenv("ATROPOS_SERVER_BASE_URL")
+            or os.getenv("OPENAI_BASE_URL")
+            or os.getenv("LLM_BASE_URL")
+            or "http://127.0.0.1:8080"
+        )
+        model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash"
+        api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local"
+
+        env_config = SandboxTerminalSmokeEnvConfig(
+            tokenizer_name="Qwen/Qwen2.5-1.5B-Instruct",  # tokenization only
+            group_size=1,
+            use_wandb=False,
+            include_messages=True,
+            ensure_scores_are_not_same=False,
+            total_steps=2,
+            batch_size=1,
+            server_base_url=base_url,
+            server_model=model,
+            # Tooling: sandbox-only terminal.
+            enabled_toolsets=["terminal"],
+            disabled_toolsets=[],
+            # Default to Nomad sandboxing; users can override via --env.* args.
+            sandbox_image=os.getenv("ATROPOS_SANDBOX_IMAGE") or "atropos-sandbox:local",
+            purge_job_on_shutdown=True,
+        )
+
+        server_configs = [
+            APIServerConfig(
+                model_name=model,
+                base_url=f"{base_url.rstrip('/')}/v1",
+                api_key=api_key,
+                num_max_requests_at_once=1,
+                num_requests_for_eval=1,
+                timeout=120,
+            )
+        ]
+        return env_config, server_configs
+
+    async def setup_agent_env(self) -> None:
+        return None
+
+    async def get_next_item(self) -> Item:
+        self._iter += 1
+        return _forced_tool_item()
+
+    def build_task(self, item: Item) -> str:
+        return str(item.get("prompt") or "")
+
+    def build_agent_config(self, item: Item) -> AgentConfig:  # noqa: ARG002
+        # Avoid imposing max_tokens by default; tool-tag responses can be long for some models.
+        return AgentConfig(
+            max_steps=min(8, int(self.config.agent_max_steps)),
+            temperature=0.2,
+            max_tokens=None,
+            system_prompt=STRICT_TOOLCALL_SYSTEM_PROMPT,
+        )
+
+    async def score_trajectory(self, item: Item, final_response: str) -> float:
+        # Scoring happens in verify_and_score_trajectory so we can inspect tool results.
+        _ = (item, final_response)
+        return 0.0
+
+    async def verify_and_score_trajectory(
+        self,
+        item: Item,
+        final_response: str,
+        *,
+        trajectory_id: str,  # noqa: ARG002
+        exec_tool,  # noqa: ARG002
+        agent_result: AgentResult | None = None,
+    ) -> tuple[float, Dict[str, Any]]:
+        if agent_result is None:
+            return 0.0, {"error": "Missing agent_result"}
+
+        observed: str = ""
+        tool_ok = False
+        for step in agent_result.steps:
+            for res in step.tool_results:
+                if not res.success:
+                    return 0.0, {"error": res.error, "output": res.output}
+                out = (res.output or "").strip()
+                if out:
+                    observed = out.splitlines()[-1].strip()
+                    tool_ok = True
+
+        final = (final_response or "").strip()
+        score = 1.0 if tool_ok and agent_result.total_tool_calls > 0 and observed and final == observed else 0.0
+        return score, {"observed": observed, "tool_calls": agent_result.total_tool_calls, "command": item.get("command")}
+
+
+if __name__ == "__main__":
+    SandboxTerminalSmokeEnv.cli()