From 16fb41f9cc77b70762aaeef42b0e1efd16b2ad6f Mon Sep 17 00:00:00 2001
From: Shannon Sands
Date: Tue, 3 Feb 2026 11:41:34 +1000
Subject: [PATCH] smokes working, fixing up toolserver. switched to llama.cpp,
 ollama sucks too much

---
 .env.example                               |   5 +-
 atropos/agent/atropos_agent.py             | 168 ++++++++++++-----
 atropos/envs/agent_env.py                  |  11 +-
 atropos/envs/hermes_compat_test_env.py     | 203 ++++++++------------
 atropos/envs/sandbox_terminal_smoke_env.py | 169 +++++++++++++++++
 atropos/envs/swe_smith_oracle_env.py       |   3 +-
 atropos/envs/test_env.py                   |   4 +-
 atropos/envs/toolserver_smoke_env.py       | 162 ++++++++++++++++
 atropos/slots/pool.py                      | 124 +++++++++----
 atropos/tools/base.py                      |  88 +++++++--
 hermes_agent.egg-info/PKG-INFO             |   2 +-
 hermes_agent.egg-info/SOURCES.txt          |  16 +-
 hermes_agent.egg-info/entry_points.txt     |   2 +
 hermes_agent.egg-info/requires.txt         |   2 +-
 pyproject.toml                             |   4 +-
 scripts/launch_llama_cpp_glm47_flash.sh    |  62 +++++++
 tests/test_tool_call_parsing.py            |  31 ++++
 uv.lock                                    |   4 +-
 18 files changed, 822 insertions(+), 238 deletions(-)
 create mode 100644 atropos/envs/sandbox_terminal_smoke_env.py
 create mode 100644 atropos/envs/toolserver_smoke_env.py
 create mode 100755 scripts/launch_llama_cpp_glm47_flash.sh
 create mode 100644 tests/test_tool_call_parsing.py

diff --git a/.env.example b/.env.example
index 4c13afec63..1f2cba1da5 100644
--- a/.env.example
+++ b/.env.example
@@ -22,12 +22,13 @@ HERMES_BACKEND=openai
 # of OpenRouter.
 #
 # Local server convenience (base URL without /v1):
-# ATROPOS_SERVER_BASE_URL=http://localhost:11434
+# llama.cpp example (see `Hermes-Agent/scripts/launch_llama_cpp_glm47_flash.sh`):
+# ATROPOS_SERVER_BASE_URL=http://127.0.0.1:8080
 # ATROPOS_SERVER_MODEL=glm-4.7-flash
 # ATROPOS_SERVER_API_KEY=local
 #
 # Generic OpenAI-compatible (base URL should include /v1):
-# OPENAI_BASE_URL=http://localhost:11434/v1
+# OPENAI_BASE_URL=http://127.0.0.1:8080/v1
 # OPENAI_API_KEY=local
 
 # =============================================================================
diff --git a/atropos/agent/atropos_agent.py b/atropos/agent/atropos_agent.py
index 9ea6e3044e..2fb1de6d68 100644
--- a/atropos/agent/atropos_agent.py
+++ b/atropos/agent/atropos_agent.py
@@ -15,6 +15,7 @@ The agent uses Hermes-style XML tags for tool calls:
 
 import asyncio
+import json
 import os
 from contextlib import asynccontextmanager
 from dataclasses import dataclass, field
 from typing import Any, AsyncGenerator, Awaitable, Callable, Dict, List, Optional, Union
@@ -27,33 +28,66 @@ from atroposlib.envs.server_handling.managed_server import ManagedServer
 
 load_dotenv()
 
-# Default system prompt with tool calling instructions
-AGENT_SYSTEM_PROMPT = """You are a helpful AI assistant with access to tools. You can use tools to accomplish tasks.
+# Default system prompt with tool calling instructions.
+#
+# IMPORTANT: In training-mode environments we want "raw text in -> raw text out" and we
+# parse tool calls from completion text. Do not rely on server-specific `tool_calls` fields.
+AGENT_SYSTEM_PROMPT = """You are a function-calling AI model.
 
-## Available Tools
+You are provided with function signatures within <tools></tools> XML tags.
+You may call one or more functions to assist with the user query. If available tools are not relevant,
+respond in natural language.
+
+After calling & executing a function, you will be provided with function results within
+<tool_response> </tool_response> XML tags.
+
+Here are the available tools:
+<tools>
 
-{tool_descriptions}
+{tools_json}
+</tools>
 
-## How to Use Tools
-To use a tool, output a tool call in the following format:
-<tool_call>{{"name": "tool_name", "arguments": {{"arg1": "value1", "arg2": "value2"}}}}</tool_call>
+## REQUIRED TOOL FORMAT
 
-You may reason about what to do before calling a tool:
-<think>I need to check what files are in the current directory...</think>
-<tool_call>{{"name": "bash", "arguments": {{"command": "ls -la"}}}}</tool_call>
+When you decide to call a tool, your assistant message MUST be:
+1) exactly one <think> ... </think> block, followed by
+2) one or more <tool_call> ... </tool_call> blocks,
+and NOTHING else in that message.
 
-After a tool is executed, you will receive the result:
-<tool_response>{{"success": true, "output": "..."}}</tool_response>
+For each tool call, output a JSON object with this schema:
+{"name": "function_name", "arguments": { ... }}
 
-Continue using tools as needed until you have completed the task.
-When you have finished, provide your final response without any tool calls.
+Each tool call MUST be enclosed within <tool_call> </tool_call> XML tags.
+The JSON inside <tool_call> MUST be valid JSON with double quotes.
 
-## Important Guidelines
-- Think step by step about what you need to do
-- Use tools to gather information and perform actions
-- If a tool call fails, analyze the error and try a different approach
-- Provide clear, concise responses when the task is complete
+Do NOT output <tool_response> in an assistant message.
+
+After you receive tool results, you may either call more tools (same required format) or provide the final answer.
+When providing the final answer, do NOT include any <tool_call> blocks.
+
+## ICL (examples)
+
+User: Show the current directory.
+Assistant: <think>
+I should use the terminal tool to print the current directory.
+</think>
+<tool_call>
+{"name": "terminal", "arguments": {"command": "pwd"}}
+</tool_call>
+User: <tool_response>{"success": true, "output": "/tmp\\n"}</tool_response>
+Assistant: /tmp
+
+User: List files, then count them.
+Assistant: <think>
+I should list files and count lines.
+</think>
+<tool_call>
+{"name": "terminal", "arguments": {"command": "ls -1 | wc -l"}}
+</tool_call>
+User: <tool_response>{"success": true, "output": "3\\n"}</tool_response>
+Assistant: 3
+
+User: Run pwd, then print ok.
+Assistant: <think>
+I should run pwd, then run a command that prints ok.
+</think>
+<tool_call>
+{"name": "terminal", "arguments": {"command": "pwd"}}
+</tool_call>
+<tool_call>
+{"name": "terminal", "arguments": {"command": "echo ok"}}
+</tool_call>
+User: <tool_response>{"success": true, "output": "/tmp\\n"}</tool_response>
+User: <tool_response>{"success": true, "output": "ok\\n"}</tool_response>
+Assistant: ok
 """
 
 
@@ -62,8 +96,9 @@ class AgentConfig:
     """Configuration for the AtroposAgent."""
 
     # Generation parameters
-    temperature: float = 0.7
-    max_tokens: int = 4096
+    temperature: Optional[float] = 0.7
+    # Default to "let the backend decide" (important for tool-tag completions that may be longer).
+    max_tokens: Optional[int] = None
 
     # Agent behavior
     max_steps: int = 50
@@ -222,13 +257,53 @@
         """Build the system prompt with tool descriptions."""
         if self.config.system_prompt:
             return self.config.system_prompt
-
-        tool_descriptions = self.tools.get_prompt_description()
-        if not tool_descriptions:
-            tool_descriptions = "(No tools available)"
-
-        return AGENT_SYSTEM_PROMPT.format(tool_descriptions=tool_descriptions)
-
+
+        tools_json = self.tools.get_prompt_tool_definitions_json()
+        # Avoid `str.format()` here because the prompt contains many literal `{}` braces
+        # in JSON examples; we only want to substitute the single `{tools_json}` token.
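+        # (With str.format(), the literal `{"name": ...}` JSON examples in the prompt
+        #  would be parsed as replacement fields and raise KeyError: '"name"'.)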
+ return AGENT_SYSTEM_PROMPT.replace("{tools_json}", tools_json) + + def _debug_dump_request(self, *, step_num: int, chat_kwargs: Dict[str, Any]) -> None: + if os.getenv("ATROPOS_DEBUG_AGENT_REQUEST") != "1": + return + try: + # Avoid dumping megabytes by default; messages can be huge. + meta = { + "step": step_num, + "chat_kwargs_keys": sorted(list(chat_kwargs.keys())), + "n": chat_kwargs.get("n"), + "max_tokens": chat_kwargs.get("max_tokens"), + "temperature": chat_kwargs.get("temperature"), + "num_messages": len(chat_kwargs.get("messages") or []), + } + print("\n=== ATROPOS_DEBUG_AGENT_REQUEST ===", flush=True) + print(meta, flush=True) + + if os.getenv("ATROPOS_DEBUG_AGENT_REQUEST_FULL") == "1": + payload = dict(chat_kwargs) + # Make the payload more legible and less huge. + try: + dumped = json.dumps(payload, ensure_ascii=False, indent=2) + except Exception: + dumped = repr(payload) + print("\n=== ATROPOS_DEBUG_AGENT_REQUEST_FULL ===", flush=True) + print(dumped[:200_000], flush=True) + except Exception: + return + + def _debug_dump_response(self, *, step_num: int, response: Any) -> None: + if os.getenv("ATROPOS_DEBUG_AGENT_RESPONSE") != "1": + return + print("\n=== ATROPOS_DEBUG_AGENT_RESPONSE ===", flush=True) + print({"step": step_num, "type": type(response).__name__}, flush=True) + try: + dumped = response.model_dump() # openai pydantic model + except Exception: + dumped = getattr(response, "__dict__", {"repr": repr(response)}) + # Keep the dump bounded; we only need enough to see the assistant message content. + text = str(dumped) + print(text[:200_000], flush=True) + async def run( self, task: str, @@ -265,12 +340,15 @@ class AtroposAgent: # Keep a copy of the prompt messages used for this completion. # Useful for reconstructing tokens/masks when state tracking is unavailable. prompt_messages = list(messages) - response = await managed.chat_completion( - messages=messages, - n=1, - max_tokens=self.config.max_tokens, - temperature=self.config.temperature, - ) + chat_kwargs: Dict[str, Any] = {"messages": messages, "n": 1} + if self.config.max_tokens is not None: + chat_kwargs["max_tokens"] = self.config.max_tokens + if self.config.temperature is not None: + chat_kwargs["temperature"] = self.config.temperature + + self._debug_dump_request(step_num=step_num + 1, chat_kwargs=chat_kwargs) + response = await managed.chat_completion(**chat_kwargs) + self._debug_dump_response(step_num=step_num + 1, response=response) current_node = None if hasattr(managed, "get_state"): @@ -286,7 +364,9 @@ class AtroposAgent: error=f"Generation error: {str(e)}", ) - response_text = response.choices[0].message.content or "" + msg = response.choices[0].message + # Some OpenAI-compatible servers populate `message.reasoning` and leave `content=""`. 
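+        # (e.g. a choice shaped like {"message": {"content": "", "reasoning": "..."}};
+        #  the shape is illustrative, not a spec.)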
+ response_text = (msg.content or "") or (getattr(msg, "reasoning", None) or "") tool_calls = ToolCall.parse_from_text(response_text) step = AgentStep( @@ -380,12 +460,15 @@ class AtroposAgent: Tuple of (response_text, tool_results, sequence_data) """ async with self._managed() as managed: - response = await managed.chat_completion( - messages=messages, - n=1, - max_tokens=self.config.max_tokens, - temperature=self.config.temperature, - ) + chat_kwargs: Dict[str, Any] = {"messages": messages, "n": 1} + if self.config.max_tokens is not None: + chat_kwargs["max_tokens"] = self.config.max_tokens + if self.config.temperature is not None: + chat_kwargs["temperature"] = self.config.temperature + + self._debug_dump_request(step_num=1, chat_kwargs=chat_kwargs) + response = await managed.chat_completion(**chat_kwargs) + self._debug_dump_response(step_num=1, response=response) current_node = None if hasattr(managed, "get_state"): @@ -393,7 +476,8 @@ class AtroposAgent: nodes = state.get("nodes", []) current_node = nodes[-1] if nodes else None - response_text = response.choices[0].message.content or "" + msg = response.choices[0].message + response_text = (msg.content or "") or (getattr(msg, "reasoning", None) or "") tool_results = [] if execute_tools: diff --git a/atropos/envs/agent_env.py b/atropos/envs/agent_env.py index 0e88a22237..789ea88b7c 100644 --- a/atropos/envs/agent_env.py +++ b/atropos/envs/agent_env.py @@ -18,7 +18,7 @@ from pydantic import Field from atroposlib.envs.base import APIServerConfig, BaseEnv, BaseEnvConfig, Item, ScoredDataGroup, ScoredDataItem -from ..agent import AgentConfig, AtroposAgent +from ..agent import AgentConfig, AgentResult, AtroposAgent from ..slots import SlotPool, SlotPoolConfig from ..tools import ToolRegistry, build_tool_registry from ..tools.tool_executor import ToolExecutor, ToolExecutorConfig @@ -56,7 +56,10 @@ class AgentEnvConfig(BaseEnvConfig): # basic agent defaults agent_max_steps: int = Field(default=50, description="Max ReACT steps per trajectory") agent_temperature: float = Field(default=0.7, description="Sampling temperature") - agent_max_tokens: int = Field(default=4096, description="Max tokens per model response") + agent_max_tokens: Optional[int] = Field( + default=None, + description="Max tokens per model response (default: let backend decide)", + ) agent_tool_delay_s: float = Field(default=0.0, description="Delay between tool calls (seconds)") # tool selection @@ -143,6 +146,7 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]): *, trajectory_id: str, exec_tool: Callable[["ToolCall"], Awaitable["ToolResult"]], + agent_result: Optional[AgentResult] = None, ) -> tuple[float, Dict[str, Any]]: """ Optional hook: run in-sandbox verification before scoring. @@ -152,7 +156,7 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]): Default: calls `score_trajectory()` and returns empty metadata. 
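+        Example (hypothetical override sketch; assumes `ToolCall(name=..., arguments=...)`
+        is constructible with defaults for its remaining fields):
+
+            async def verify_and_score_trajectory(self, item, final_response, *,
+                                                  trajectory_id, exec_tool, agent_result=None):
+                res = await exec_tool(ToolCall(name="terminal", arguments={"command": "pytest -q"}))
+                return (1.0 if res.success else 0.0), {"output": res.output}
+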
""" - _ = (trajectory_id, exec_tool) # default ignores in-workspace verification + _ = (trajectory_id, exec_tool, agent_result) # default ignores in-workspace verification score = await self.score_trajectory(item, final_response) return score, {} @@ -299,6 +303,7 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]): result.final_response, trajectory_id=trajectory_id, exec_tool=_exec, + agent_result=result, ) messages = [{"role": "system", "content": agent._build_system_prompt()}] # noqa: SLF001 diff --git a/atropos/envs/hermes_compat_test_env.py b/atropos/envs/hermes_compat_test_env.py index 270b507f83..93b0fe2dd9 100644 --- a/atropos/envs/hermes_compat_test_env.py +++ b/atropos/envs/hermes_compat_test_env.py @@ -1,70 +1,63 @@ """ -Hermes-Agent (Atropos-compatible) smoke environment. +Hermes-Agent + Atropos (Nomad sandbox) compatibility smoke environment. -This is a minimal `BaseEnv` environment that uses Hermes-Agent's Atropos-backed -runner (`AtroposAIAgent`) and can be exercised via `BaseEnv`'s `process` mode. +This environment is intended to validate, end-to-end: + BaseEnv.process -> AgentEnv -> ToolExecutor (batched) -> Nomad SlotPool -> sandbox_server -This deliberately does NOT use slot multiplexing / sandboxes yet (stage 1). +It forces the model to use a sandbox tool by asking it to run a command that +generates a high-entropy token inside the sandbox, then repeat it exactly. + +Run (process mode): + uv run python -m atropos.envs.hermes_compat_test_env process --env.use_wandb false --env.total_steps 2 --env.group_size 1 """ from __future__ import annotations -import json import os -import uuid -from typing import Dict, List, Tuple +from typing import Any, Dict, List, Tuple from dotenv import load_dotenv from pydantic import Field -from atroposlib.envs.base import APIServerConfig, BaseEnv, BaseEnvConfig, Item +from atroposlib.envs.base import APIServerConfig, Item + +from ..agent import AgentConfig, AgentResult +from ..tools import ToolCall +from .agent_env import AgentEnv, AgentEnvConfig load_dotenv() -def _build_forced_tool_item() -> Item: - """ - Construct a task that *cannot* be completed reliably without executing a tool. - - We generate a high-entropy token *inside the tool execution* and ask the agent to - repeat it exactly. Scoring verifies that: - - a terminal tool call occurred (role="tool" message present), and - - the final answer matches the tool stdout exactly. - """ +def _forced_tool_item() -> Item: + # Use double quotes in the shell command and show JSON escaping explicitly. + # This avoids invalid JSON escapes like `\\'` (not valid JSON) that some models produce. + cmd = 'python -c "import secrets; print(secrets.token_hex(16))"' return { - "command": "python -c \"import secrets; print(secrets.token_hex(16))\"", + "command": cmd, "prompt": ( - "Use the terminal tool to run:\n" - "python -c \"import secrets; print(secrets.token_hex(16))\"\n" - "Then answer with EXACTLY what it printed and nothing else." + "You are acting as an agent inside a sandboxed environment.\n" + "You MUST use the terminal tool to execute commands.\n" + "Run this exact command:\n" + f"{cmd}\n" + "When you call the tool, use valid JSON inside . Example:\n" + '{"name": "terminal", "arguments": {"command": ' + '"python -c \\\\"import secrets; print(secrets.token_hex(16))\\\\""}}' + "\n" + "Then respond with EXACTLY what it printed (the hex token) and nothing else.\n" + "Do not guess. Do not explain." 
), } -TEST_ITEMS: List[Item] = [ - _build_forced_tool_item(), - _build_forced_tool_item(), -] - - -class HermesCompatTestEnvConfig(BaseEnvConfig): - """Config for HermesCompatTestEnv.""" - +class HermesCompatTestEnvConfig(AgentEnvConfig): server_base_url: str = Field( - default="http://localhost:11434", + default="http://127.0.0.1:8080", description="Base URL for an OpenAI-compatible chat server (without /v1).", ) server_model: str = Field(default="glm-4.7-flash", description="Model name") -class HermesCompatTestEnv(BaseEnv): - """ - Minimal BaseEnv that runs Hermes-Agent's Atropos-compatible agent loop. - - Run (process mode): - uv run atropos-agent-hermes-compat-test process --env.use_wandb false --env.total_steps 2 --env.group_size 1 - """ - +class HermesCompatTestEnv(AgentEnv[HermesCompatTestEnvConfig]): name = "hermes_compat_test_env" env_config_cls = HermesCompatTestEnvConfig @@ -75,39 +68,22 @@ class HermesCompatTestEnv(BaseEnv): slurm: bool = False, testing: bool = False, ): - super().__init__(config=config, server_configs=server_configs, slurm=slurm, testing=testing) + super().__init__(config, server_configs, slurm, testing) self._iter = 0 - from atropos_compatible_agent import AtroposAIAgent # noqa: WPS433 - - # Only expose terminal for this smoke env. - self._agent = AtroposAIAgent( - server=self.server, - tokenizer=self.tokenizer, - model=getattr(config, "server_model", "local"), - max_iterations=8, - enabled_toolsets=["terminal"], - tool_delay=0.0, - # Let the server decide token limits; we care about tool calling correctness here. - max_tokens=None, - temperature=None, - ) - @classmethod def config_init(cls) -> Tuple[HermesCompatTestEnvConfig, List[APIServerConfig]]: base_url = ( os.getenv("ATROPOS_SERVER_BASE_URL") or os.getenv("OPENAI_BASE_URL") or os.getenv("LLM_BASE_URL") - or "http://localhost:11434" + or "http://127.0.0.1:8080" ) model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash" - # Never pass through real API keys in this smoke env (they will be printed by BaseEnv config logging). - # Local OpenAI-compatible servers typically ignore the API key anyway. - api_key = "local" + api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local" env_config = HermesCompatTestEnvConfig( - tokenizer_name="Qwen/Qwen2.5-1.5B-Instruct", + tokenizer_name="Qwen/Qwen2.5-1.5B-Instruct", # tokenization only group_size=1, use_wandb=False, include_messages=True, @@ -116,13 +92,18 @@ class HermesCompatTestEnv(BaseEnv): batch_size=1, server_base_url=base_url, server_model=model, + # Tooling: sandbox-only terminal. + enabled_toolsets=["terminal"], + disabled_toolsets=[], + # Default to Nomad sandboxing; users can override via --env.* args. + sandbox_image=os.getenv("ATROPOS_SANDBOX_IMAGE") or "atropos-sandbox:local", + purge_job_on_shutdown=True, ) server_configs = [ APIServerConfig( - server_type="openai", model_name=model, - base_url=f"{base_url}/v1", + base_url=f"{base_url.rstrip('/')}/v1", api_key=api_key, num_max_requests_at_once=1, num_requests_for_eval=1, @@ -131,77 +112,55 @@ class HermesCompatTestEnv(BaseEnv): ] return env_config, server_configs - async def setup(self): + async def setup_agent_env(self) -> None: return None async def get_next_item(self) -> Item: - # Regenerate token per task to avoid leakage across steps. 
- item = _build_forced_tool_item() self._iter += 1 - return item + return _forced_tool_item() - async def collect_trajectory(self, item: Item): - prompt = item.get("prompt", "") + def build_task(self, item: Item) -> str: + return str(item.get("prompt") or "") - result = await self._agent.run_conversation_async( - prompt, - task_id=str(uuid.uuid4()), + def build_agent_config(self, item: Item) -> AgentConfig: # noqa: ARG002 + # Avoid imposing max_tokens by default; tool-tag responses can be long for some models. + return AgentConfig( + max_steps=min(8, int(self.config.agent_max_steps)), + temperature=0.2, + max_tokens=None, ) - final = (result.get("final_response") or "").strip() + async def score_trajectory(self, item: Item, final_response: str) -> float: + # Scoring happens in verify_and_score_trajectory so we can inspect tool results. + _ = (item, final_response) + return 0.0 + + async def verify_and_score_trajectory( + self, + item: Item, + final_response: str, + *, + trajectory_id: str, # noqa: ARG002 + exec_tool, # noqa: ARG002 + agent_result: AgentResult | None = None, + ) -> tuple[float, Dict[str, Any]]: + if agent_result is None: + return 0.0, {"error": "Missing agent_result"} - # Verify the agent actually executed the tool by extracting stdout from the tool message. observed: str = "" - saw_tool = False - for msg in result.get("messages", []): - if msg.get("role") == "tool": - saw_tool = True - # Tool messages contain JSON strings from terminal tool. - try: - payload = json.loads(msg.get("content") or "{}") - out = (payload.get("output") or "").strip() - if out: - observed = out.splitlines()[-1].strip() - except Exception: - continue - # Pass if: - # - a tool call occurred, and - # - the final answer matches the observed stdout exactly. - score = 1.0 if saw_tool and observed and final == observed else 0.0 + tool_ok = False + for step in agent_result.steps: + for res in step.tool_results: + if not res.success: + return 0.0, {"error": res.error, "output": res.output} + out = (res.output or "").strip() + if out: + observed = out.splitlines()[-1].strip() + tool_ok = True - # Tokenization fallback: build tokens/masks from final prompt + completion. - # Note: this is sufficient for smoke testing; production training should - # use a backend that supports ManagedServer state tracking. - system_prompt = result.get("system_prompt") - messages: List[Dict[str, str]] = result.get("messages", []) - prompt_messages = messages[:-1] if messages and messages[-1].get("role") == "assistant" else messages - - if system_prompt: - prompt_messages = [{"role": "system", "content": system_prompt}] + prompt_messages - - if hasattr(self.tokenizer, "apply_chat_template"): - prompt_text = self.tokenizer.apply_chat_template( - prompt_messages, tokenize=False, add_generation_prompt=True - ) - prompt_tokens = self.tokenizer.encode(prompt_text, add_special_tokens=False) - else: - prompt_text = "\n".join([f"{m['role']}: {m['content']}" for m in prompt_messages]) - prompt_tokens = self.tokenizer.encode(prompt_text, add_special_tokens=True) - - output_tokens = self.tokenizer.encode(final, add_special_tokens=False) - - scored = { - "tokens": prompt_tokens + output_tokens, - "masks": ([-100] * len(prompt_tokens)) + output_tokens, - "scores": score, - "messages": prompt_messages + [{"role": "assistant", "content": final}], - } - - return scored, [] - - async def evaluate(self, *args, **kwargs): # noqa: ARG002 - # Minimal eval hook for BaseEnv abstract method. 
-        return {}
+        final = (final_response or "").strip()
+        score = 1.0 if tool_ok and agent_result.total_tool_calls > 0 and observed and final == observed else 0.0
+        return score, {"observed": observed, "tool_calls": agent_result.total_tool_calls, "command": item.get("command")}
 
 
 if __name__ == "__main__":
diff --git a/atropos/envs/sandbox_terminal_smoke_env.py b/atropos/envs/sandbox_terminal_smoke_env.py
new file mode 100644
index 0000000000..9c140a30b6
--- /dev/null
+++ b/atropos/envs/sandbox_terminal_smoke_env.py
@@ -0,0 +1,169 @@
+"""
+Nomad sandbox terminal smoke environment (training-oriented).
+
+Validates, end-to-end:
+    BaseEnv.process -> AgentEnv -> ToolExecutor (batched) -> Nomad SlotPool -> sandbox_server
+
+It forces the model to use a sandbox tool by asking it to run a command that
+generates a high-entropy token inside the sandbox, then repeat it exactly.
+
+Run (process mode):
+    uv run python -m atropos.envs.sandbox_terminal_smoke_env process --env.use_wandb false --env.total_steps 2 --env.group_size 1
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict, List, Tuple
+
+from dotenv import load_dotenv
+from pydantic import Field
+
+from atroposlib.envs.base import APIServerConfig, Item
+
+from ..agent import AgentConfig, AgentResult
+from ..tools import ToolCall
+from .agent_env import AgentEnv, AgentEnvConfig
+
+load_dotenv()
+
+# None means "use the agent's default Hermes-style system prompt".
+STRICT_TOOLCALL_SYSTEM_PROMPT = None
+
+
+def _forced_tool_item() -> Item:
+    # Use double quotes in the shell command and show JSON escaping explicitly.
+    # This avoids invalid JSON escapes like `\\'` (not valid JSON) that some models produce.
+    cmd = 'python -c "import secrets; print(secrets.token_hex(16))"'
+    return {
+        "command": cmd,
+        "prompt": (
+            "You MUST use the terminal tool.\n"
+            "Run this exact command:\n"
+            f"{cmd}\n"
+            "When you call the tool, use valid JSON inside <tool_call>. Example:\n"
+            '<tool_call>{"name": "terminal", "arguments": {"command": '
+            '"python -c \\"import secrets; print(secrets.token_hex(16))\\""}}'
+            "</tool_call>\n"
+            "Then respond with EXACTLY what it printed (the hex token) and nothing else.\n"
+            "Do not guess. Do not explain."
+ ), + } + + +class SandboxTerminalSmokeEnvConfig(AgentEnvConfig): + server_base_url: str = Field( + default="http://127.0.0.1:8080", + description="Base URL for an OpenAI-compatible chat server (without /v1).", + ) + server_model: str = Field(default="glm-4.7-flash", description="Model name") + + +class SandboxTerminalSmokeEnv(AgentEnv[SandboxTerminalSmokeEnvConfig]): + name = "sandbox_terminal_smoke_env" + env_config_cls = SandboxTerminalSmokeEnvConfig + + def __init__( + self, + config: SandboxTerminalSmokeEnvConfig, + server_configs: List[APIServerConfig], + slurm: bool = False, + testing: bool = False, + ): + super().__init__(config, server_configs, slurm, testing) + self._iter = 0 + + @classmethod + def config_init(cls) -> Tuple[SandboxTerminalSmokeEnvConfig, List[APIServerConfig]]: + base_url = ( + os.getenv("ATROPOS_SERVER_BASE_URL") + or os.getenv("OPENAI_BASE_URL") + or os.getenv("LLM_BASE_URL") + or "http://127.0.0.1:8080" + ) + model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash" + api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local" + + env_config = SandboxTerminalSmokeEnvConfig( + tokenizer_name="Qwen/Qwen2.5-1.5B-Instruct", # tokenization only + group_size=1, + use_wandb=False, + include_messages=True, + ensure_scores_are_not_same=False, + total_steps=2, + batch_size=1, + server_base_url=base_url, + server_model=model, + # Tooling: sandbox-only terminal. + enabled_toolsets=["terminal"], + disabled_toolsets=[], + # Default to Nomad sandboxing; users can override via --env.* args. + sandbox_image=os.getenv("ATROPOS_SANDBOX_IMAGE") or "atropos-sandbox:local", + purge_job_on_shutdown=True, + ) + + server_configs = [ + APIServerConfig( + model_name=model, + base_url=f"{base_url.rstrip('/')}/v1", + api_key=api_key, + num_max_requests_at_once=1, + num_requests_for_eval=1, + timeout=120, + ) + ] + return env_config, server_configs + + async def setup_agent_env(self) -> None: + return None + + async def get_next_item(self) -> Item: + self._iter += 1 + return _forced_tool_item() + + def build_task(self, item: Item) -> str: + return str(item.get("prompt") or "") + + def build_agent_config(self, item: Item) -> AgentConfig: # noqa: ARG002 + # Avoid imposing max_tokens by default; tool-tag responses can be long for some models. + return AgentConfig( + max_steps=min(8, int(self.config.agent_max_steps)), + temperature=0.2, + max_tokens=None, + system_prompt=STRICT_TOOLCALL_SYSTEM_PROMPT, + ) + + async def score_trajectory(self, item: Item, final_response: str) -> float: + # Scoring happens in verify_and_score_trajectory so we can inspect tool results. 
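+        # (AgentEnv scores with the value returned by verify_and_score_trajectory,
+        #  so this constant is never used for this env.)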
+ _ = (item, final_response) + return 0.0 + + async def verify_and_score_trajectory( + self, + item: Item, + final_response: str, + *, + trajectory_id: str, # noqa: ARG002 + exec_tool, # noqa: ARG002 + agent_result: AgentResult | None = None, + ) -> tuple[float, Dict[str, Any]]: + if agent_result is None: + return 0.0, {"error": "Missing agent_result"} + + observed: str = "" + tool_ok = False + for step in agent_result.steps: + for res in step.tool_results: + if not res.success: + return 0.0, {"error": res.error, "output": res.output} + out = (res.output or "").strip() + if out: + observed = out.splitlines()[-1].strip() + tool_ok = True + + final = (final_response or "").strip() + score = 1.0 if tool_ok and agent_result.total_tool_calls > 0 and observed and final == observed else 0.0 + return score, {"observed": observed, "tool_calls": agent_result.total_tool_calls, "command": item.get("command")} + + +if __name__ == "__main__": + SandboxTerminalSmokeEnv.cli() diff --git a/atropos/envs/swe_smith_oracle_env.py b/atropos/envs/swe_smith_oracle_env.py index 756284ed2e..79f384495c 100644 --- a/atropos/envs/swe_smith_oracle_env.py +++ b/atropos/envs/swe_smith_oracle_env.py @@ -72,7 +72,7 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]): os.getenv("ATROPOS_SERVER_BASE_URL") or os.getenv("OPENAI_BASE_URL") or os.getenv("LLM_BASE_URL") - or "http://localhost:11434" + or "http://127.0.0.1:8080" ) model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash" api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local" @@ -252,6 +252,7 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]): *, trajectory_id: str, exec_tool, + agent_result=None, # noqa: ARG002 ) -> tuple[float, Dict[str, Any]]: _ = trajectory_id repo_dir = self._repo_name(item) diff --git a/atropos/envs/test_env.py b/atropos/envs/test_env.py index 8ae6a2f27c..7c7f08e011 100644 --- a/atropos/envs/test_env.py +++ b/atropos/envs/test_env.py @@ -61,7 +61,7 @@ class SimpleTestEnvConfig(AgentEnvConfig): """Configuration for the simple test environment.""" server_base_url: str = Field( - default="http://localhost:11434", + default="http://127.0.0.1:8080", description="Base URL for an OpenAI-compatible server (without /v1)", ) server_model: str = Field( @@ -102,7 +102,7 @@ class SimpleTestEnv(AgentEnv[SimpleTestEnvConfig]): os.getenv("ATROPOS_SERVER_BASE_URL") or os.getenv("OPENAI_BASE_URL") or os.getenv("LLM_BASE_URL") - or "http://localhost:11434" + or "http://127.0.0.1:8080" ) model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash" api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local" diff --git a/atropos/envs/toolserver_smoke_env.py b/atropos/envs/toolserver_smoke_env.py new file mode 100644 index 0000000000..4b39af468d --- /dev/null +++ b/atropos/envs/toolserver_smoke_env.py @@ -0,0 +1,162 @@ +""" +ToolServer routing smoke environment. + +Validates that: + - sandbox tools run through Nomad SlotPool (terminal -> bash in sandbox) + - external tools run through ToolServer (skills_list) + +This env uses ToolServer in-process by default (`tool_server_url="inprocess"`), +so it is self-contained for local testing. 
+
+Run:
+    uv run python -m atropos.envs.toolserver_smoke_env process --env.use_wandb false --env.total_steps 1 --env.group_size 1
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict, List, Tuple
+
+from dotenv import load_dotenv
+from pydantic import Field
+
+from atroposlib.envs.base import APIServerConfig, Item
+
+from ..agent import AgentConfig, AgentResult
+from .agent_env import AgentEnv, AgentEnvConfig
+
+load_dotenv()
+
+
+class ToolServerSmokeEnvConfig(AgentEnvConfig):
+    server_base_url: str = Field(
+        default="http://127.0.0.1:8080",
+        description="Base URL for an OpenAI-compatible chat server (without /v1).",
+    )
+    server_model: str = Field(default="glm-4.7-flash", description="Model name")
+
+
+class ToolServerSmokeEnv(AgentEnv[ToolServerSmokeEnvConfig]):
+    name = "toolserver_smoke_env"
+    env_config_cls = ToolServerSmokeEnvConfig
+
+    def __init__(
+        self,
+        config: ToolServerSmokeEnvConfig,
+        server_configs: List[APIServerConfig],
+        slurm: bool = False,
+        testing: bool = False,
+    ):
+        super().__init__(config, server_configs, slurm, testing)
+        self._iter = 0
+
+    @classmethod
+    def config_init(cls) -> Tuple[ToolServerSmokeEnvConfig, List[APIServerConfig]]:
+        base_url = (
+            os.getenv("ATROPOS_SERVER_BASE_URL")
+            or os.getenv("OPENAI_BASE_URL")
+            or os.getenv("LLM_BASE_URL")
+            or "http://127.0.0.1:8080"
+        )
+        model = os.getenv("ATROPOS_SERVER_MODEL") or os.getenv("LLM_MODEL") or "glm-4.7-flash"
+        api_key = os.getenv("ATROPOS_SERVER_API_KEY") or os.getenv("OPENAI_API_KEY") or "local"
+
+        env_config = ToolServerSmokeEnvConfig(
+            tokenizer_name="Qwen/Qwen2.5-1.5B-Instruct",  # tokenization only
+            group_size=1,
+            use_wandb=False,
+            include_messages=True,
+            ensure_scores_are_not_same=False,
+            total_steps=1,
+            batch_size=1,
+            server_base_url=base_url,
+            server_model=model,
+            enabled_toolsets=["terminal", "skills"],
+            disabled_toolsets=[],
+            # Self-contained ToolServer for local smoke.
+            tool_server_url="inprocess",
+            sandbox_image=os.getenv("ATROPOS_SANDBOX_IMAGE") or "atropos-sandbox:local",
+            purge_job_on_shutdown=True,
+        )
+
+        server_configs = [
+            APIServerConfig(
+                model_name=model,
+                base_url=f"{base_url.rstrip('/')}/v1",
+                api_key=api_key,
+                num_max_requests_at_once=1,
+                num_requests_for_eval=1,
+                timeout=120,
+            )
+        ]
+        return env_config, server_configs
+
+    async def setup_agent_env(self) -> None:
+        return None
+
+    async def get_next_item(self) -> Item:
+        self._iter += 1
+        return {
+            "prompt": (
+                "You MUST call exactly one tool per assistant message.\n"
+                "\n"
+                "Step 1) Call the skills_list tool (no arguments), then stop.\n"
+                "Step 2) After you receive the tool response, call the terminal tool to run:\n"
+                "python -c \"print('ok')\"\n"
+                "Step 3) After you receive the terminal tool response, answer with just: ok\n"
+                "\n"
+                "Tool call format requirements:\n"
+                "- Every tool call MUST be a complete <tool_call> XML block with a closing tag.\n"
+                "- Do NOT emit a second <tool_call> in the same assistant message.\n"
+                "\n"
+                "Example:\n"
+                "<tool_call>{\"name\": \"skills_list\", \"arguments\": {}}</tool_call>\n"
+                "Do not include anything else in your final answer."
+            )
+        }
+
+    def build_task(self, item: Item) -> str:
+        return str(item.get("prompt") or "")
+
+    def build_agent_config(self, item: Item) -> AgentConfig:  # noqa: ARG002
+        return AgentConfig(
+            max_steps=min(10, int(self.config.agent_max_steps)),
+            temperature=0.2,
+            max_tokens=None,
+        )
+
+    async def score_trajectory(self, item: Item, final_response: str) -> float:
+        _ = (item, final_response)
+        return 0.0
+
+    async def verify_and_score_trajectory(
+        self,
+        item: Item,
+        final_response: str,
+        *,
+        trajectory_id: str,  # noqa: ARG002
+        exec_tool,  # noqa: ARG002
+        agent_result: AgentResult | None = None,
+    ) -> tuple[float, Dict[str, Any]]:
+        if agent_result is None:
+            return 0.0, {"error": "Missing agent_result"}
+
+        called = {c.name for s in agent_result.steps for c in s.tool_calls}
+        need = {"skills_list", "terminal"}
+        if not need.issubset(called):
+            return 0.0, {"error": f"Missing tool calls: {sorted(need - called)}", "called": sorted(called)}
+
+        terminal_ok = False
+        for step in agent_result.steps:
+            for call, res in zip(step.tool_calls, step.tool_results):
+                if call.name != "terminal":
+                    continue
+                # Guard against empty output: `"".splitlines()` is `[]`, so indexing
+                # `[-1]` directly would raise IndexError on an empty tool result.
+                lines = (res.output or "").strip().splitlines()
+                if res.success and lines and lines[-1].strip() == "ok":
+                    terminal_ok = True
+
+        score = 1.0 if terminal_ok and (final_response or "").strip() == "ok" else 0.0
+        return score, {"called": sorted(called), "final": (final_response or "").strip()}
+
+
+if __name__ == "__main__":
+    ToolServerSmokeEnv.cli()
diff --git a/atropos/slots/pool.py b/atropos/slots/pool.py
index ba7eb683b3..d6ace16b7d 100644
--- a/atropos/slots/pool.py
+++ b/atropos/slots/pool.py
@@ -138,42 +138,47 @@ class SlotPool:
             return
 
         logger.info(f"Starting SlotPool (job_id={self.config.job_id})")
-
-        # Check Nomad health
-        if not await self.nomad.is_healthy():
-            raise RuntimeError(f"Nomad is not reachable at {self.config.nomad_address}")
-
-        # Check if job exists
-        job = await self.nomad.get_job(self.config.job_id)
-
-        if job is None:
-            # Deploy new job
-            logger.info(f"Deploying sandbox job: {self.config.job_id}")
-            job_spec = create_sandbox_job(
-                job_id=self.config.job_id,
-                image=self.config.image,
-                count=self.config.min_containers,
-                slots_per_container=self.config.slots_per_container,
-                privileged=self.config.privileged,
-                cpu=self.config.cpu,
-                memory=self.config.memory,
-                datacenter=self.config.datacenter,
-            )
-            result = await self.nomad.submit_job(job_spec)
-            if "error" in result:
-                raise RuntimeError(f"Failed to submit job: {result}")
-
-        # Wait for allocations to be running
+
+        try:
+            # Check Nomad health
+            if not await self.nomad.is_healthy():
+                raise RuntimeError(f"Nomad is not reachable at {self.config.nomad_address}")
+
+            # Check if job exists
+            job = await self.nomad.get_job(self.config.job_id)
+
+            if job is None:
+                # Deploy new job
+                logger.info(f"Deploying sandbox job: {self.config.job_id}")
+                job_spec = create_sandbox_job(
+                    job_id=self.config.job_id,
+                    image=self.config.image,
+                    count=self.config.min_containers,
+                    slots_per_container=self.config.slots_per_container,
+                    privileged=self.config.privileged,
+                    cpu=self.config.cpu,
+                    memory=self.config.memory,
+                    datacenter=self.config.datacenter,
+                )
+                result = await self.nomad.submit_job(job_spec)
+                if "error" in result:
+                    raise RuntimeError(f"Failed to submit job: {result}")
+
+            # Wait for allocations to be running (even if the job already existed).
-        await self._wait_for_healthy_allocations(self.config.min_containers)
+            await self._wait_for_healthy_allocations(self.config.min_containers)
-
-        # Discover existing allocations and slots
-        await self._refresh_slots()
-
-        # Start health check task
-        self._health_task = asyncio.create_task(self._health_check_loop())
-
-        self._started = True
-        logger.info(f"SlotPool started: {self.total_slots} slots available")
+
+            # Discover existing allocations and slots
+            await self._refresh_slots()
+
+            # Start health check task
+            self._health_task = asyncio.create_task(self._health_check_loop())
+
+            self._started = True
+            logger.info(f"SlotPool started: {self.total_slots} slots available")
+        except Exception:
+            # Ensure aiohttp sessions are not leaked if we fail to start.
+            await self.stop(purge_job=False)
+            raise
 
     async def stop(self, purge_job: bool = False) -> None:
         """
@@ -384,6 +389,19 @@ class SlotPool:
         """Wait for allocations to become healthy."""
         import time
         start = time.time()
+
+        def _summarize_alloc_detail(detail: Dict[str, Any]) -> str:
+            task_states = detail.get("TaskStates") or {}
+            parts: List[str] = []
+            if isinstance(task_states, dict):
+                for task_name, st in task_states.items():
+                    events = (st or {}).get("Events") or []
+                    if isinstance(events, list) and events:
+                        last = events[-1]
+                        desc = last.get("DisplayMessage") or last.get("Message") or last.get("Type") or ""
+                        if desc:
+                            parts.append(f"{task_name}: {desc}")
+            return "; ".join(parts)
 
         while time.time() - start < timeout:
             allocs = await self.nomad.get_job_allocations(self.config.job_id)
@@ -393,13 +411,45 @@
                 if alloc.status == AllocationStatus.RUNNING and alloc.http_address:
                     if await self.executor.health_check(alloc.http_address):
                         healthy_count += 1
+
+                # Fast-fail on obvious driver/image errors to avoid waiting out the full timeout.
+                if alloc.id:
+                    detail = await self.nomad.get_allocation(alloc.id)
+                    if isinstance(detail, dict):
+                        summary = _summarize_alloc_detail(detail)
+                        lowered = summary.lower()
+                        if "failed to pull" in lowered or "pull access denied" in lowered:
+                            raise RuntimeError(
+                                "Nomad allocation failed to start due to a Docker image pull error. "
+                                f"Allocation {alloc.id[:8]}: {summary}\n"
+                                "If you're using a local image tag (e.g. `atropos-sandbox:local`) on macOS, "
+                                "make sure the image is loaded into Docker (build with `docker buildx build --load ...`)."
+                            )
 
             if healthy_count >= min_count:
                 return
 
             await asyncio.sleep(2.0)
-
-        raise RuntimeError(f"Timed out waiting for {min_count} healthy allocations")
+
+        # Timed out: include allocation status detail to help debugging.
+        allocs = await self.nomad.get_job_allocations(self.config.job_id)
+        alloc_lines: List[str] = []
+        for alloc in allocs[:10]:
+            addr = alloc.http_address or "-"
+            line = f"{alloc.id[:8]} status={alloc.status.value} http={addr}"
+            detail = await self.nomad.get_allocation(alloc.id)
+            if isinstance(detail, dict):
+                summary = _summarize_alloc_detail(detail)
+                if summary:
+                    line += f" detail={summary}"
+            alloc_lines.append(line)
+
+        hint = (
+            "Timed out waiting for healthy sandbox allocations.\n"
+            f"Job: {self.config.job_id}, desired_healthy: {min_count}\n"
+            "Allocations:\n - " + "\n - ".join(alloc_lines)
+        )
+        raise RuntimeError(hint)
 
     async def _try_scale_up(self) -> bool:
         """Attempt to scale up the job."""
diff --git a/atropos/tools/base.py b/atropos/tools/base.py
index 33c9d1017a..a27b8f1ad1 100644
--- a/atropos/tools/base.py
+++ b/atropos/tools/base.py
@@ -72,26 +72,65 @@ class ToolCall:
         """
         Extract tool calls from text using Hermes-style XML tags.
 
-        Format: <tool_call>{"name": "...", "arguments": {...}}</tool_call>
+        Supported formats (STRICT: requires well-formed closing tags):
+        - Hermes JSON wrapper:
+          <tool_call>{"name": "...", "arguments": {...}}</tool_call>
+        - GLM/llama.cpp style:
+          <tool_call>terminal {"command": "ls -la"}</tool_call>
         """
-        calls = []
+        calls: List["ToolCall"] = []
+
+        if not text:
+            return calls
+
+        def _append_from_payload(*, name: str, arguments: Dict[str, Any], raw: str, uniq_id: Optional[str] = None) -> None:
+            if not isinstance(name, str) or not name:
+                return
+            if not isinstance(arguments, dict):
+                return
+            calls.append(
+                cls(
+                    name=name,
+                    arguments=arguments,
+                    raw_text=raw,
+                    uniq_id=uniq_id or str(uuid.uuid4()),
+                )
+            )
 
+        # STRICT parsing: only accept well-formed <tool_call> ... </tool_call> blocks.
         pattern = r"<tool_call>\s*(.*?)\s*</tool_call>"
-        matches = re.findall(pattern, text, re.DOTALL)
-
-        for match in matches:
-            try:
-                data = json.loads(match)
-                uniq_id = data.get("uniq_id") or data.get("id") or str(uuid.uuid4())
-                calls.append(cls(
+        for inner in re.findall(pattern, text, re.DOTALL):
+            cleaned = (inner or "").strip()
+            if not cleaned:
+                continue
+
+            # Hermes JSON wrapper.
+            if cleaned.startswith("{"):
+                try:
+                    data = json.loads(cleaned)
+                except json.JSONDecodeError:
+                    continue
+                uniq_id = data.get("uniq_id") or data.get("id") or None
+                _append_from_payload(
                     name=data.get("name", ""),
                     arguments=data.get("arguments", {}),
-                    raw_text=match,
+                    raw=inner,
                     uniq_id=uniq_id,
-                ))
-            except json.JSONDecodeError:
-                # Skip malformed tool calls
+                )
                 continue
-
+
+            # GLM/llama.cpp style: <tool_call>terminal {...}</tool_call>
+            m = re.match(r"^\s*([A-Za-z0-9_.:-]+)\s*(\{.*\})\s*$", cleaned, re.DOTALL)
+            if not m:
+                continue
+            name = m.group(1)
+            args_text = m.group(2)
+            try:
+                args = json.loads(args_text)
+            except json.JSONDecodeError:
+                continue
+            _append_from_payload(name=name, arguments=args, raw=inner)
+
         return calls
 
     @classmethod
@@ -208,6 +247,27 @@ class ToolRegistry:
         """Generate tool descriptions for system prompt."""
         descriptions = [tool.schema.to_prompt_description() for tool in self._tools.values()]
         return "\n\n".join(descriptions)
+
+    def get_prompt_tool_definitions_json(self) -> str:
+        """
+        Return a Hermes-style JSON list of tool definitions for use inside a `<tools>...</tools>` block.
+
+        Hermes trajectories historically use a simplified schema list:
+        [{"name": ..., "description": ..., "parameters": {...}, "required": null}, ...]
+        """
+        formatted: List[Dict[str, Any]] = []
+        for tool in self._tools.values():
+            fn = tool.schema.to_dict().get("function", {})
+            formatted.append(
+                {
+                    "name": fn.get("name", tool.name),
+                    "description": fn.get("description", ""),
+                    "parameters": fn.get("parameters", {}),
+                    # Keep parity with Hermes saved trajectories (required is typically null there).
+ "required": None, + } + ) + return json.dumps(formatted, ensure_ascii=False) async def execute(self, call: ToolCall) -> ToolResult: """Execute a tool call.""" diff --git a/hermes_agent.egg-info/PKG-INFO b/hermes_agent.egg-info/PKG-INFO index 98fb7f6d68..d17e541623 100644 --- a/hermes_agent.egg-info/PKG-INFO +++ b/hermes_agent.egg-info/PKG-INFO @@ -29,7 +29,7 @@ Provides-Extra: dev Requires-Dist: pytest; extra == "dev" Requires-Dist: pytest-asyncio; extra == "dev" Provides-Extra: atropos -Requires-Dist: atroposlib @ git+ssh://git@github.com/NousResearch/atropos.git ; extra == "atropos" +Requires-Dist: atroposlib @ git+https://github.com/NousResearch/atropos.git ; extra == "atropos" Requires-Dist: aiohttp; extra == "atropos" Requires-Dist: fastapi; extra == "atropos" Requires-Dist: uvicorn; extra == "atropos" diff --git a/hermes_agent.egg-info/SOURCES.txt b/hermes_agent.egg-info/SOURCES.txt index 3464034f41..87e0cd7d93 100644 --- a/hermes_agent.egg-info/SOURCES.txt +++ b/hermes_agent.egg-info/SOURCES.txt @@ -17,9 +17,10 @@ atropos/api/tool_executor_server.py atropos/api/tool_server.py atropos/envs/__init__.py atropos/envs/agent_env.py -atropos/envs/hermes_compat_test_env.py +atropos/envs/sandbox_terminal_smoke_env.py atropos/envs/swe_smith_oracle_env.py atropos/envs/test_env.py +atropos/envs/toolserver_smoke_env.py atropos/nomad/__init__.py atropos/nomad/client.py atropos/slots/__init__.py @@ -30,18 +31,13 @@ atropos/terminal/__init__.py atropos/terminal/asciinema_stream.py atropos/tools/__init__.py atropos/tools/base.py -atropos/tools/basic_tools.py -atropos/tools/image_generation_tool.py -atropos/tools/mixture_of_agents_tool.py -atropos/tools/terminal_hecate.py +atropos/tools/build_registry.py +atropos/tools/hermes_external_tools.py +atropos/tools/sandbox_stubs.py atropos/tools/terminal_stateful_tool.py -atropos/tools/terminal_tool.py atropos/tools/tmux_tool.py atropos/tools/tool_executor.py -atropos/tools/toolset_distributions.py -atropos/tools/toolsets.py -atropos/tools/vision_tools.py -atropos/tools/web_tools.py +atropos/tools/toolset_resolver.py hermes_agent.egg-info/PKG-INFO hermes_agent.egg-info/SOURCES.txt hermes_agent.egg-info/dependency_links.txt diff --git a/hermes_agent.egg-info/entry_points.txt b/hermes_agent.egg-info/entry_points.txt index 2e72c8e210..42fd7548f8 100644 --- a/hermes_agent.egg-info/entry_points.txt +++ b/hermes_agent.egg-info/entry_points.txt @@ -1,2 +1,4 @@ [console_scripts] hermes-agent = run_agent:main +hermes-atropos-sandbox-smoke = atropos.envs.sandbox_terminal_smoke_env:SandboxTerminalSmokeEnv.cli +hermes-atropos-toolserver-smoke = atropos.envs.toolserver_smoke_env:ToolServerSmokeEnv.cli diff --git a/hermes_agent.egg-info/requires.txt b/hermes_agent.egg-info/requires.txt index 0ef7437426..6e9ad30e5a 100644 --- a/hermes_agent.egg-info/requires.txt +++ b/hermes_agent.egg-info/requires.txt @@ -16,7 +16,7 @@ typer platformdirs [atropos] -atroposlib @ git+ssh://git@github.com/NousResearch/atropos.git +atroposlib @ git+https://github.com/NousResearch/atropos.git aiohttp fastapi uvicorn diff --git a/pyproject.toml b/pyproject.toml index aa78046db3..328a16a0fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ modal = ["modal", "boto3"] dev = ["pytest", "pytest-asyncio"] # Install Atropos from source (PyPI is often stale for this internal dependency). 
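+# (The switch from git+ssh to git+https below lets `uv`/`pip` fetch atroposlib
+# without a configured GitHub SSH key.)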
 atropos = [
-    "atroposlib @ git+ssh://git@github.com/NousResearch/atropos.git",
+    "atroposlib @ git+https://github.com/NousResearch/atropos.git",
     # Atropos integration runtime deps (kept optional for Hermes-only users)
     "aiohttp",
     "fastapi",
@@ -47,6 +47,8 @@ atropos = [
 
 [project.scripts]
 hermes-agent = "run_agent:main"
+hermes-atropos-sandbox-smoke = "atropos.envs.sandbox_terminal_smoke_env:SandboxTerminalSmokeEnv.cli"
+hermes-atropos-toolserver-smoke = "atropos.envs.toolserver_smoke_env:ToolServerSmokeEnv.cli"
 
 [tool.setuptools]
 py-modules = [
diff --git a/scripts/launch_llama_cpp_glm47_flash.sh b/scripts/launch_llama_cpp_glm47_flash.sh
new file mode 100755
index 0000000000..3f3716a2fa
--- /dev/null
+++ b/scripts/launch_llama_cpp_glm47_flash.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Launch a local llama.cpp OpenAI-compatible server running GLM-4.7-Flash (GGUF).
+#
+# Requires:
+#   - `llama-server` installed (e.g. `brew install llama.cpp`)
+#
+# The default port is 8080. If another local service (e.g. the Atropos
+# sandbox_server in local dev) is already listening there, override
+# LLAMA_CPP_PORT; the script refuses to start when the port is in use.
+#
+# Usage:
+#   Hermes-Agent/scripts/launch_llama_cpp_glm47_flash.sh
+#
+# Override defaults:
+#   LLAMA_CPP_HOST=127.0.0.1 LLAMA_CPP_PORT=8082 \
+#   LLAMA_CPP_HF_REPO=ggml-org/GLM-4.7-Flash-GGUF \
+#   LLAMA_CPP_HF_FILE=GLM-4.7-Flash-Q4_K.gguf \
+#   Hermes-Agent/scripts/launch_llama_cpp_glm47_flash.sh
+
+HOST="${LLAMA_CPP_HOST:-127.0.0.1}"
+PORT="${LLAMA_CPP_PORT:-8080}"
+HF_REPO="${LLAMA_CPP_HF_REPO:-ggml-org/GLM-4.7-Flash-GGUF}"
+HF_FILE="${LLAMA_CPP_HF_FILE:-GLM-4.7-Flash-Q4_K.gguf}"
+ALIAS="${LLAMA_CPP_ALIAS:-glm-4.7-flash}"
+
+if ! command -v llama-server >/dev/null 2>&1; then
+    echo "Error: llama-server not found in PATH."
+    echo "Install via Homebrew: brew install llama.cpp"
+    exit 1
+fi
+
+echo "Launching llama.cpp server..."
+echo "  host:  $HOST"
+echo "  port:  $PORT"
+echo "  repo:  $HF_REPO"
+echo "  file:  $HF_FILE"
+echo "  alias: $ALIAS"
+echo
+echo "Suggested env vars for Hermes/Atropos integration:"
+echo "  export ATROPOS_SERVER_BASE_URL=http://${HOST}:${PORT}"
+echo "  export ATROPOS_SERVER_MODEL=${ALIAS}"
+echo "  export ATROPOS_SERVER_API_KEY=local"
+echo
+
+if command -v lsof >/dev/null 2>&1; then
+    if lsof -nP -iTCP:"$PORT" -sTCP:LISTEN >/dev/null 2>&1; then
+        echo "Error: port $PORT is already in use."
+        echo "Pick a different port, e.g.:"
+        echo "  LLAMA_CPP_PORT=8082 Hermes-Agent/scripts/launch_llama_cpp_glm47_flash.sh"
+        exit 1
+    fi
+fi
+
+exec llama-server \
+    --host "$HOST" \
+    --port "$PORT" \
+    --hf-repo "$HF_REPO" \
+    --hf-file "$HF_FILE" \
+    --alias "$ALIAS" \
+    -c 32768 \
+    -n -1
diff --git a/tests/test_tool_call_parsing.py b/tests/test_tool_call_parsing.py
new file mode 100644
index 0000000000..49db864b24
--- /dev/null
+++ b/tests/test_tool_call_parsing.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+from atropos.tools.base import ToolCall
+
+
+def test_parse_tool_call_json_wrapper() -> None:
+    text = '<tool_call>{"name":"terminal","arguments":{"command":"pwd"}}</tool_call>'
+    calls = ToolCall.parse_from_text(text)
+    assert len(calls) == 1
+    assert calls[0].name == "terminal"
+    assert calls[0].arguments == {"command": "pwd"}
+
+
+def test_parse_tool_call_glm_style() -> None:
+    text = '<tool_call>terminal\n{"command":"ls -la"}</tool_call>'
+    calls = ToolCall.parse_from_text(text)
+    assert len(calls) == 1
+    assert calls[0].name == "terminal"
+    assert calls[0].arguments == {"command": "ls -la"}
+
+
+def test_parse_tool_call_missing_close_tag() -> None:
+    text = '<tool_call>terminal\n{"command":"echo hi"}'
+    calls = ToolCall.parse_from_text(text)
+    assert calls == []
+
+
+def test_parse_tool_call_strips_accidental_xml() -> None:
+    text = '<tool_call>\n<tool_call>terminal\n{"command":"ls -la"}</tool_call>'
+    calls = ToolCall.parse_from_text(text)
+    assert calls == []
diff --git a/uv.lock b/uv.lock
index 52eaaccbe0..0cb6f730fc 100644
--- a/uv.lock
+++ b/uv.lock
@@ -218,7 +218,7 @@ wheels = [
 [[package]]
 name = "atroposlib"
 version = "0.3.0"
-source = { git = "ssh://git@github.com/NousResearch/atropos.git#462abbebf75f44e811116c3730ce9874c4358a80" }
+source = { git = "https://github.com/NousResearch/atropos.git#462abbebf75f44e811116c3730ce9874c4358a80" }
 dependencies = [
     { name = "aiofiles" },
     { name = "aiohttp" },
@@ -913,7 +913,7 @@ modal = [
 [package.metadata]
 requires-dist = [
     { name = "aiohttp", marker = "extra == 'atropos'" },
-    { name = "atroposlib", marker = "extra == 'atropos'", git = "ssh://git@github.com/NousResearch/atropos.git" },
+    { name = "atroposlib", marker = "extra == 'atropos'", git = "https://github.com/NousResearch/atropos.git" },
     { name = "boto3", marker = "extra == 'modal'" },
     { name = "fal-client" },
     { name = "fastapi", marker = "extra == 'atropos'" },