Mirror of https://github.com/NousResearch/hermes-agent.git (synced 2026-05-08 03:01:47 +00:00)
fixed infinite loop on agent errors
This commit is contained in:
parent 5a9c98a771
commit 7130fa50cb
4 changed files with 144 additions and 23 deletions
|
|
@@ -34,20 +34,23 @@ load_dotenv()
|
||||||
#
|
#
|
||||||
# IMPORTANT: In training-mode environments we want "raw text in -> raw text out" and we
|
# IMPORTANT: In training-mode environments we want "raw text in -> raw text out" and we
|
||||||
# parse tool calls from completion text. Do not rely on server-specific `tool_calls` fields.
|
# parse tool calls from completion text. Do not rely on server-specific `tool_calls` fields.
|
||||||
AGENT_SYSTEM_PROMPT = """You are a function-calling AI model.
|
AGENT_SYSTEM_PROMPT = """You are a deep thinking AI. You MUST enclose your internal reasoning inside <think>...</think> tags.
|
||||||
|
|
||||||
|
You are a function calling AI model.
|
||||||
|
|
||||||
You are provided with function signatures within <tools></tools> XML tags.
|
You are provided with function signatures within <tools></tools> XML tags.
|
||||||
You may call one or more functions to assist with the user query. If available tools are not relevant,
|
You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.
|
||||||
respond in natural language.
|
|
||||||
|
|
||||||
After calling & executing a function, you will be provided with function results within
|
After calling & executing a function, you will be provided with function results within <tool_response></tool_response> XML tags.
|
||||||
<tool_response></tool_response> XML tags.
|
|
||||||
|
|
||||||
Here are the available tools:
|
Here are the available tools:
|
||||||
<tools>
|
<tools>
|
||||||
{tools_json}
|
{tools_json}
|
||||||
</tools>
|
</tools>
|
||||||
|
|
||||||
|
Use the following JSON schema for each tool call you will make:
|
||||||
|
{"title": "FunctionCall", "type": "object", "properties": {"name": {"title": "Name", "type": "string"}, "arguments": {"title": "Arguments", "type": "object"}}, "required": ["name", "arguments"]}
|
||||||
|
|
||||||
## REQUIRED TOOL FORMAT
|
## REQUIRED TOOL FORMAT
|
||||||
|
|
||||||
When you decide to call a tool, your assistant message MUST be:
|
When you decide to call a tool, your assistant message MUST be:
|
||||||
|
|
@@ -55,10 +58,14 @@ When you decide to call a tool, your assistant message MUST be:
|
||||||
2) one or more <tool_call>...</tool_call> blocks,
|
2) one or more <tool_call>...</tool_call> blocks,
|
||||||
and NOTHING else in that message.
|
and NOTHING else in that message.
|
||||||
|
|
||||||
For each tool call, output a JSON object with this schema:
|
If you need to explain anything, put it inside <think>. Do NOT write natural language outside <think> or <tool_call>.
|
||||||
{"name": "function_name", "arguments": { ... }}
|
|
||||||
|
|
||||||
Each tool call MUST be enclosed within <tool_call></tool_call> XML tags.
|
For each function call return a JSON object with function name and arguments within <tool_call></tool_call> XML tags as follows:
|
||||||
|
<tool_call>
|
||||||
|
{"name": "<function-name>", "arguments": {"arg1": "value1"}}
|
||||||
|
</tool_call>
|
||||||
|
|
||||||
|
Each <tool_call> must be on its own and contain ONLY the JSON object (no extra text).
|
||||||
The JSON inside <tool_call> MUST be valid JSON with double quotes.
|
The JSON inside <tool_call> MUST be valid JSON with double quotes.
|
||||||
|
|
||||||
Do NOT output <tool_response> in an assistant message.
|
Do NOT output <tool_response> in an assistant message.
|
||||||
|
|
@@ -66,27 +73,44 @@ Do NOT output <tool_response> in an assistant message.
|
||||||
After you receive tool results, you may either call more tools (same required format) or provide the final answer.
|
After you receive tool results, you may either call more tools (same required format) or provide the final answer.
|
||||||
When providing the final answer, do NOT include any <tool_call> blocks.
|
When providing the final answer, do NOT include any <tool_call> blocks.
|
||||||
|
|
||||||
|
## TERMINAL TOOL NOTES
|
||||||
|
|
||||||
|
- Commands execute under POSIX `/bin/sh` (not bash).
|
||||||
|
- Each tool call runs in a fresh shell: environment changes (like `cd` or venv activation) do not persist across tool calls.
|
||||||
|
- Avoid bash-only features like `source`, `[[ ... ]]`, or process substitution.
|
||||||
|
- Prefer explicit venv usage:
|
||||||
|
- `python -m venv .venv && . .venv/bin/activate && python -m pip install -e .` (POSIX `.` activation), or
|
||||||
|
- `.venv/bin/python -m pip install -e .` (no activation required).
|
||||||
|
|
||||||
## ICL (examples)
|
## ICL (examples)
|
||||||
|
|
||||||
User: Show the current directory.
|
User: Show the current directory.
|
||||||
Assistant:
|
Assistant:
|
||||||
<think>I should use the terminal tool to print the current directory.</think>
|
<think>I should run pwd.</think>
|
||||||
<tool_call>{"name": "terminal", "arguments": {"command": "pwd"}}</tool_call>
|
<tool_call>
|
||||||
|
{"name": "terminal", "arguments": {"command": "pwd"}}
|
||||||
|
</tool_call>
|
||||||
User: <tool_response>{"success": true, "output": "/tmp\\n"}</tool_response>
|
User: <tool_response>{"success": true, "output": "/tmp\\n"}</tool_response>
|
||||||
Assistant: /tmp
|
Assistant: /tmp
|
||||||
|
|
||||||
User: List files, then count them.
|
User: List files, then count them.
|
||||||
Assistant:
|
Assistant:
|
||||||
<think>I should list files and count lines.</think>
|
<think>I should count files.</think>
|
||||||
<tool_call>{"name": "terminal", "arguments": {"command": "ls -1 | wc -l"}}</tool_call>
|
<tool_call>
|
||||||
|
{"name": "terminal", "arguments": {"command": "ls -1 | wc -l"}}
|
||||||
|
</tool_call>
|
||||||
User: <tool_response>{"success": true, "output": "3\\n"}</tool_response>
|
User: <tool_response>{"success": true, "output": "3\\n"}</tool_response>
|
||||||
Assistant: 3
|
Assistant: 3
|
||||||
|
|
||||||
User: Run pwd, then print ok.
|
User: Run pwd, then print ok (two tool calls).
|
||||||
Assistant:
|
Assistant:
|
||||||
<think>I should run pwd, then run a command that prints ok.</think>
|
<think>I should run two commands.</think>
|
||||||
<tool_call>{"name": "terminal", "arguments": {"command": "pwd"}}</tool_call>
|
<tool_call>
|
||||||
<tool_call>{"name": "terminal", "arguments": {"command": "echo ok"}}</tool_call>
|
{"name": "terminal", "arguments": {"command": "pwd"}}
|
||||||
|
</tool_call>
|
||||||
|
<tool_call>
|
||||||
|
{"name": "terminal", "arguments": {"command": "echo ok"}}
|
||||||
|
</tool_call>
|
||||||
User: <tool_response>{"success": true, "output": "/tmp\\n"}</tool_response>
|
User: <tool_response>{"success": true, "output": "/tmp\\n"}</tool_response>
|
||||||
User: <tool_response>{"success": true, "output": "ok\\n"}</tool_response>
|
User: <tool_response>{"success": true, "output": "ok\\n"}</tool_response>
|
||||||
Assistant: ok
|
Assistant: ok
|
||||||
|
|
@@ -337,6 +361,9 @@ class AtroposAgent:
|
||||||
final_response = ""
|
final_response = ""
|
||||||
final_node = None
|
final_node = None
|
||||||
final_prompt_messages: Optional[List[Dict[str, str]]] = None
|
final_prompt_messages: Optional[List[Dict[str, str]]] = None
|
||||||
|
last_node = None
|
||||||
|
last_prompt_messages: Optional[List[Dict[str, str]]] = None
|
||||||
|
last_response_text: str = ""
|
||||||
|
|
||||||
# Use ManagedServer for automatic token tracking
|
# Use ManagedServer for automatic token tracking
|
||||||
async with self._managed() as managed:
|
async with self._managed() as managed:
|
||||||
|
|
@@ -384,6 +411,9 @@ class AtroposAgent:
|
||||||
# Some OpenAI-compatible servers populate `message.reasoning` and leave `content=""`.
|
# Some OpenAI-compatible servers populate `message.reasoning` and leave `content=""`.
|
||||||
response_text = (msg.content or "") or (getattr(msg, "reasoning", None) or "")
|
response_text = (msg.content or "") or (getattr(msg, "reasoning", None) or "")
|
||||||
tool_calls = ToolCall.parse_from_text(response_text)
|
tool_calls = ToolCall.parse_from_text(response_text)
|
||||||
|
last_node = current_node
|
||||||
|
last_prompt_messages = prompt_messages
|
||||||
|
last_response_text = response_text
|
||||||
|
|
||||||
step = AgentStep(
|
step = AgentStep(
|
||||||
step_number=step_num + 1,
|
step_number=step_num + 1,
|
||||||
|
|
@@ -419,11 +449,39 @@ class AtroposAgent:
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Reached max steps without completing
|
# Reached max steps without completing
|
||||||
|
# Return a failure result but include the last observed completion so callers can
|
||||||
|
# record the trajectory (score=0) without triggering retries.
|
||||||
|
final_response = last_response_text or final_response
|
||||||
|
final_node = last_node
|
||||||
|
final_prompt_messages = last_prompt_messages
|
||||||
|
trajectory_data = None
|
||||||
|
if final_node:
|
||||||
|
trajectory_data = SequenceData.from_sequence_node(final_node)
|
||||||
|
elif final_prompt_messages is not None and self.tokenizer is not None:
|
||||||
|
if hasattr(self.tokenizer, "apply_chat_template"):
|
||||||
|
prompt_text = self.tokenizer.apply_chat_template(
|
||||||
|
final_prompt_messages, tokenize=False, add_generation_prompt=True
|
||||||
|
)
|
||||||
|
prompt_tokens = self.tokenizer.encode(prompt_text, add_special_tokens=False)
|
||||||
|
else:
|
||||||
|
prompt_text = "\n".join([f"{m['role']}: {m['content']}" for m in final_prompt_messages])
|
||||||
|
prompt_tokens = self.tokenizer.encode(prompt_text, add_special_tokens=True)
|
||||||
|
output_tokens = self.tokenizer.encode(final_response, add_special_tokens=False)
|
||||||
|
tokens = prompt_tokens + output_tokens
|
||||||
|
masked_tokens = ([-100] * len(prompt_tokens)) + output_tokens
|
||||||
|
logprobs = ([1.0] * len(prompt_tokens)) + ([0.0] * len(output_tokens))
|
||||||
|
trajectory_data = SequenceData(
|
||||||
|
full_text=f"{prompt_text}{final_response}",
|
||||||
|
tokens=tokens,
|
||||||
|
masked_tokens=masked_tokens,
|
||||||
|
logprobs=logprobs,
|
||||||
|
)
|
||||||
return AgentResult(
|
return AgentResult(
|
||||||
success=False,
|
success=False,
|
||||||
final_response=final_response,
|
final_response=final_response,
|
||||||
steps=steps,
|
steps=steps,
|
||||||
error=f"Reached maximum steps ({self.config.max_steps})",
|
error=f"Reached maximum steps ({self.config.max_steps})",
|
||||||
|
trajectory_data=trajectory_data,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Build result with trajectory data
|
# Build result with trajectory data
|
||||||
|
|
|
||||||
|
|
@@ -362,10 +362,37 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]):
|
||||||
flush=True,
|
flush=True,
|
||||||
)
|
)
|
||||||
if not result.success or result.trajectory_data is None:
|
if not result.success or result.trajectory_data is None:
|
||||||
return None, []
|
# Do not trigger BaseEnv retries for agent failures.
|
||||||
|
# Record the trajectory with score 0.0 so training/eval can see the failure mode.
|
||||||
|
messages = [{"role": "system", "content": agent._build_system_prompt()}] # noqa: SLF001
|
||||||
|
messages.append({"role": "user", "content": task})
|
||||||
|
for step in result.steps:
|
||||||
|
messages.append({"role": "assistant", "content": step.assistant_message})
|
||||||
|
if step.tool_results:
|
||||||
|
tool_text = "\n".join(r.to_xml() for r in step.tool_results)
|
||||||
|
messages.append({"role": "user", "content": tool_text})
|
||||||
|
|
||||||
|
scored: ScoredDataItem = {
|
||||||
|
"tokens": (result.trajectory_data.tokens if result.trajectory_data else []),
|
||||||
|
"masks": (result.trajectory_data.masked_tokens if result.trajectory_data else []),
|
||||||
|
"scores": 0.0,
|
||||||
|
}
|
||||||
|
if self.config.include_messages:
|
||||||
|
# Record a final failure marker as a user-side tool_response-like block so it survives templates.
|
||||||
|
import json
|
||||||
|
|
||||||
|
err = result.error or "agent_failed"
|
||||||
|
messages.append(
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": f"<tool_response>{json.dumps({'success': False, 'error': err})}</tool_response>",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
scored["messages"] = messages
|
||||||
|
return scored, []
|
||||||
|
|
||||||
print(f"[AgentEnv] tid={trajectory_id} verify_and_score_trajectory() start", flush=True)
|
print(f"[AgentEnv] tid={trajectory_id} verify_and_score_trajectory() start", flush=True)
|
||||||
score, _score_metadata = await self.verify_and_score_trajectory(
|
score, score_metadata = await self.verify_and_score_trajectory(
|
||||||
item,
|
item,
|
||||||
result.final_response,
|
result.final_response,
|
||||||
trajectory_id=trajectory_id,
|
trajectory_id=trajectory_id,
|
||||||
|
|
@@ -387,6 +414,14 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]):
|
||||||
tool_text = "\n".join(r.to_xml() for r in step.tool_results)
|
tool_text = "\n".join(r.to_xml() for r in step.tool_results)
|
||||||
messages.append({"role": "user", "content": tool_text})
|
messages.append({"role": "user", "content": tool_text})
|
||||||
|
|
||||||
|
# Optional: allow env verification to attach additional messages (e.g. install logs).
|
||||||
|
if self.config.include_messages and isinstance(score_metadata, dict):
|
||||||
|
extra = score_metadata.get("verification_messages")
|
||||||
|
if isinstance(extra, list):
|
||||||
|
for m in extra:
|
||||||
|
if isinstance(m, dict) and isinstance(m.get("role"), str) and isinstance(m.get("content"), str):
|
||||||
|
messages.append({"role": m["role"], "content": m["content"]})
|
||||||
|
|
||||||
scored: ScoredDataItem = {
|
scored: ScoredDataItem = {
|
||||||
"tokens": result.trajectory_data.tokens,
|
"tokens": result.trajectory_data.tokens,
|
||||||
"masks": result.trajectory_data.masked_tokens,
|
"masks": result.trajectory_data.masked_tokens,
|
||||||
|
|
|
||||||
|
|
@@ -234,6 +234,8 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
|
||||||
"Constraints:\n"
|
"Constraints:\n"
|
||||||
"- Use a workspace-local virtualenv (e.g. inside the repo at ./.venv) to avoid cross-run contamination.\n"
|
"- Use a workspace-local virtualenv (e.g. inside the repo at ./.venv) to avoid cross-run contamination.\n"
|
||||||
"- Use non-interactive commands only.\n\n"
|
"- Use non-interactive commands only.\n\n"
|
||||||
|
"- Terminal commands run under POSIX /bin/sh and each tool call runs in a fresh shell (no persisted env vars).\n"
|
||||||
|
" Avoid bash-only `source`; prefer `. .venv/bin/activate` or `.venv/bin/python ...`.\n\n"
|
||||||
f"{verify_note}\n"
|
f"{verify_note}\n"
|
||||||
f"{trunc_note}\n"
|
f"{trunc_note}\n"
|
||||||
"Problem statement:\n"
|
"Problem statement:\n"
|
||||||
|
|
@@ -365,13 +367,20 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
|
||||||
*,
|
*,
|
||||||
trajectory_id: str,
|
trajectory_id: str,
|
||||||
exec_tool,
|
exec_tool,
|
||||||
agent_result=None, # noqa: ARG002
|
agent_result=None,
|
||||||
workspace_meta: Optional[Dict[str, Any]] = None,
|
workspace_meta: Optional[Dict[str, Any]] = None,
|
||||||
) -> tuple[float, Dict[str, Any]]:
|
) -> tuple[float, Dict[str, Any]]:
|
||||||
_ = trajectory_id
|
_ = trajectory_id
|
||||||
repo_dir = self._repo_name(item)
|
repo_dir = self._repo_name(item)
|
||||||
|
|
||||||
if self.config.verification_mode == "install":
|
if self.config.verification_mode == "install":
|
||||||
|
# Training correctness: do not reward trajectories that never actually used tools.
|
||||||
|
if agent_result is not None and getattr(agent_result, "total_tool_calls", 0) <= 0:
|
||||||
|
return 0.0, {
|
||||||
|
"verification_mode": "install",
|
||||||
|
"error": "No tool calls were made by the agent",
|
||||||
|
}
|
||||||
|
|
||||||
print(f"[SweSmithOracleEnv] tid={trajectory_id} verify (install): running pip install -e .", flush=True)
|
print(f"[SweSmithOracleEnv] tid={trajectory_id} verify (install): running pip install -e .", flush=True)
|
||||||
t0 = time.perf_counter()
|
t0 = time.perf_counter()
|
||||||
install_cmd = (
|
install_cmd = (
|
||||||
|
|
@@ -394,6 +403,14 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
|
||||||
"verification_mode": "install",
|
"verification_mode": "install",
|
||||||
"install_success": ok,
|
"install_success": ok,
|
||||||
"error": res.error,
|
"error": res.error,
|
||||||
|
"verification_messages": [{"role": "user", "content": res.to_xml()}],
|
||||||
|
}
|
||||||
|
|
||||||
|
# Training correctness: do not reward trajectories that never actually used tools.
|
||||||
|
if agent_result is not None and getattr(agent_result, "total_tool_calls", 0) <= 0:
|
||||||
|
return 0.0, {
|
||||||
|
"verification_mode": "pytest",
|
||||||
|
"error": "No tool calls were made by the agent",
|
||||||
}
|
}
|
||||||
|
|
||||||
nodeids = self._tests_for_item(item)
|
nodeids = self._tests_for_item(item)
|
||||||
|
|
@@ -412,12 +429,14 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
|
||||||
setup_res = await exec_tool(
|
setup_res = await exec_tool(
|
||||||
ToolCall(name="terminal", arguments={"command": setup_cmd, "timeout": self.config.install_timeout_s})
|
ToolCall(name="terminal", arguments={"command": setup_cmd, "timeout": self.config.install_timeout_s})
|
||||||
)
|
)
|
||||||
|
verification_messages = [{"role": "user", "content": setup_res.to_xml()}]
|
||||||
if not setup_res.success:
|
if not setup_res.success:
|
||||||
return 0.0, {
|
return 0.0, {
|
||||||
"verification_mode": "pytest",
|
"verification_mode": "pytest",
|
||||||
"phase": "install",
|
"phase": "install",
|
||||||
"error": setup_res.error,
|
"error": setup_res.error,
|
||||||
"output": setup_res.output,
|
"output": setup_res.output,
|
||||||
|
"verification_messages": verification_messages,
|
||||||
}
|
}
|
||||||
|
|
||||||
chunks = self._chunk_nodeids(nodeids, max_per_chunk=50)
|
chunks = self._chunk_nodeids(nodeids, max_per_chunk=50)
|
||||||
|
|
@@ -430,10 +449,18 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
|
||||||
arguments={"command": cmd, "timeout": self.config.test_timeout_s},
|
arguments={"command": cmd, "timeout": self.config.test_timeout_s},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
verification_messages.append({"role": "user", "content": res.to_xml()})
|
||||||
if not res.success:
|
if not res.success:
|
||||||
return 0.0, {"failed_chunk": chunk_idx, "error": res.error, "output": res.output}
|
return 0.0, {
|
||||||
|
"verification_mode": "pytest",
|
||||||
|
"phase": "pytest",
|
||||||
|
"failed_chunk": chunk_idx,
|
||||||
|
"error": res.error,
|
||||||
|
"output": res.output,
|
||||||
|
"verification_messages": verification_messages,
|
||||||
|
}
|
||||||
|
|
||||||
return 1.0, {"verification_mode": "pytest", "passed": True}
|
return 1.0, {"verification_mode": "pytest", "passed": True, "verification_messages": verification_messages}
|
||||||
|
|
||||||
async def score_trajectory(self, item: Item, final_response: str) -> float:
|
async def score_trajectory(self, item: Item, final_response: str) -> float:
|
||||||
# Not used; scoring happens in verify_and_score_trajectory.
|
# Not used; scoring happens in verify_and_score_trajectory.
|
||||||
|
|
|
||||||
|
|
@@ -23,7 +23,9 @@ class TerminalTool(Tool):
|
||||||
description=(
|
description=(
|
||||||
"Execute a command inside the sandbox slot workspace and return stdout/stderr. "
|
"Execute a command inside the sandbox slot workspace and return stdout/stderr. "
|
||||||
"Filesystem persists within a trajectory slot. Background processes are not supported "
|
"Filesystem persists within a trajectory slot. Background processes are not supported "
|
||||||
"in stateless mode."
|
"in stateless mode. Commands run under POSIX /bin/sh and each tool call runs in a fresh "
|
||||||
|
"shell (no persisted env vars). Avoid bash-only syntax like `source`; prefer `. .venv/bin/activate` "
|
||||||
|
"or invoke `.venv/bin/python ...` directly."
|
||||||
),
|
),
|
||||||
parameters={
|
parameters={
|
||||||
"command": {"type": "string", "description": "The command to execute"},
|
"command": {"type": "string", "description": "The command to execute"},
|
||||||
|
|
@@ -95,4 +97,3 @@ class WriteFileTool(Tool):
|
||||||
|
|
||||||
async def execute(self, **_kwargs) -> ToolResult:
|
async def execute(self, **_kwargs) -> ToolResult:
|
||||||
return ToolResult(success=False, error="write_file must be executed via ToolExecutor inside the sandbox")
|
return ToolResult(success=False, error="write_file must be executed via ToolExecutor inside the sandbox")
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue