mirror of https://github.com/NousResearch/hermes-agent.git
synced 2026-05-08 03:01:47 +00:00

fixed infinite loop on agent errors

This commit is contained in:
parent 5a9c98a771
commit 7130fa50cb

4 changed files with 144 additions and 23 deletions
@@ -34,20 +34,23 @@ load_dotenv()
 #
 # IMPORTANT: In training-mode environments we want "raw text in -> raw text out" and we
 # parse tool calls from completion text. Do not rely on server-specific `tool_calls` fields.
-AGENT_SYSTEM_PROMPT = """You are a function-calling AI model.
+AGENT_SYSTEM_PROMPT = """You are a deep thinking AI. You MUST enclose your internal reasoning inside <think>...</think> tags.
+
+You are a function calling AI model.
+
 You are provided with function signatures within <tools></tools> XML tags.
-You may call one or more functions to assist with the user query. If available tools are not relevant,
-respond in natural language.
+You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.
+
-After calling & executing a function, you will be provided with function results within
-<tool_response></tool_response> XML tags.
+After calling & executing a function, you will be provided with function results within <tool_response></tool_response> XML tags.
+
 Here are the available tools:
 <tools>
 {tools_json}
 </tools>

 Use the following JSON schema for each tool call you will make:
 {"title": "FunctionCall", "type": "object", "properties": {"name": {"title": "Name", "type": "string"}, "arguments": {"title": "Arguments", "type": "object"}}, "required": ["name", "arguments"]}

 ## REQUIRED TOOL FORMAT

 When you decide to call a tool, your assistant message MUST be:
@@ -55,10 +58,14 @@ When you decide to call a tool, your assistant message MUST be:
 2) one or more <tool_call>...</tool_call> blocks,
 and NOTHING else in that message.

-For each tool call, output a JSON object with this schema:
-{"name": "function_name", "arguments": { ... }}
-If you need to explain anything, put it inside <think>. Do NOT write natural language outside <think> or <tool_call>.
+Each tool call MUST be enclosed within <tool_call></tool_call> XML tags.
+For each function call return a JSON object with function name and arguments within <tool_call></tool_call> XML tags as follows:
+<tool_call>
+{"name": "<function-name>", "arguments": {"arg1": "value1"}}
+</tool_call>
+
 Each <tool_call> must be on its own and contain ONLY the JSON object (no extra text).
 The JSON inside <tool_call> MUST be valid JSON with double quotes.

 Do NOT output <tool_response> in an assistant message.
@@ -66,27 +73,44 @@ Do NOT output <tool_response> in an assistant message.
 After you receive tool results, you may either call more tools (same required format) or provide the final answer.
 When providing the final answer, do NOT include any <tool_call> blocks.

+## TERMINAL TOOL NOTES
+
+- Commands execute under POSIX `/bin/sh` (not bash).
+- Each tool call runs in a fresh shell: environment changes (like `cd` or venv activation) do not persist across tool calls.
+- Avoid bash-only features like `source`, `[[ ... ]]`, or process substitution.
+- Prefer explicit venv usage:
+  - `python -m venv .venv && . .venv/bin/activate && python -m pip install -e .` (POSIX `.` activation), or
+  - `.venv/bin/python -m pip install -e .` (no activation required).
+
 ## ICL (examples)

 User: Show the current directory.
 Assistant:
-<think>I should use the terminal tool to print the current directory.</think>
-<tool_call>{"name": "terminal", "arguments": {"command": "pwd"}}</tool_call>
+<think>I should run pwd.</think>
+<tool_call>
+{"name": "terminal", "arguments": {"command": "pwd"}}
+</tool_call>
 User: <tool_response>{"success": true, "output": "/tmp\\n"}</tool_response>
 Assistant: /tmp

 User: List files, then count them.
 Assistant:
-<think>I should list files and count lines.</think>
-<tool_call>{"name": "terminal", "arguments": {"command": "ls -1 | wc -l"}}</tool_call>
+<think>I should count files.</think>
+<tool_call>
+{"name": "terminal", "arguments": {"command": "ls -1 | wc -l"}}
+</tool_call>
 User: <tool_response>{"success": true, "output": "3\\n"}</tool_response>
 Assistant: 3

-User: Run pwd, then print ok.
+User: Run pwd, then print ok (two tool calls).
 Assistant:
-<think>I should run pwd, then run a command that prints ok.</think>
-<tool_call>{"name": "terminal", "arguments": {"command": "pwd"}}</tool_call>
-<tool_call>{"name": "terminal", "arguments": {"command": "echo ok"}}</tool_call>
+<think>I should run two commands.</think>
+<tool_call>
+{"name": "terminal", "arguments": {"command": "pwd"}}
+</tool_call>
+<tool_call>
+{"name": "terminal", "arguments": {"command": "echo ok"}}
+</tool_call>
 User: <tool_response>{"success": true, "output": "/tmp\\n"}</tool_response>
 User: <tool_response>{"success": true, "output": "ok\\n"}</tool_response>
 Assistant: ok
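Note on the parsing side of this prompt: per the comment at the top of the file, tool calls are recovered from raw completion text rather than from server `tool_calls` fields. A minimal sketch of what such a parser can look like, assuming only the `<tool_call>` wire format defined above — the helper name and regex are illustrative, not the repo's actual `ToolCall.parse_from_text`:

```python
import json
import re
from typing import Any, Dict, List

# Hypothetical stand-in for ToolCall.parse_from_text: scan raw completion
# text for <tool_call>...</tool_call> blocks and JSON-decode each one.
TOOL_CALL_RE = re.compile(r"<tool_call>\s*(.*?)\s*</tool_call>", re.DOTALL)

def parse_tool_calls(completion_text: str) -> List[Dict[str, Any]]:
    calls: List[Dict[str, Any]] = []
    for raw in TOOL_CALL_RE.findall(completion_text):
        try:
            obj = json.loads(raw)
        except json.JSONDecodeError:
            continue  # skip malformed blocks instead of crashing the agent loop
        if isinstance(obj, dict) and "name" in obj and "arguments" in obj:
            calls.append(obj)
    return calls
```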
@@ -337,6 +361,9 @@ class AtroposAgent:
         final_response = ""
         final_node = None
         final_prompt_messages: Optional[List[Dict[str, str]]] = None
+        last_node = None
+        last_prompt_messages: Optional[List[Dict[str, str]]] = None
+        last_response_text: str = ""

         # Use ManagedServer for automatic token tracking
         async with self._managed() as managed:
@@ -384,6 +411,9 @@ class AtroposAgent:
                 # Some OpenAI-compatible servers populate `message.reasoning` and leave `content=""`.
                 response_text = (msg.content or "") or (getattr(msg, "reasoning", None) or "")
                 tool_calls = ToolCall.parse_from_text(response_text)
+                last_node = current_node
+                last_prompt_messages = prompt_messages
+                last_response_text = response_text

                 step = AgentStep(
                     step_number=step_num + 1,
@@ -419,11 +449,39 @@ class AtroposAgent:

             else:
                 # Reached max steps without completing
+                # Return a failure result but include the last observed completion so callers can
+                # record the trajectory (score=0) without triggering retries.
+                final_response = last_response_text or final_response
+                final_node = last_node
+                final_prompt_messages = last_prompt_messages
+                trajectory_data = None
+                if final_node:
+                    trajectory_data = SequenceData.from_sequence_node(final_node)
+                elif final_prompt_messages is not None and self.tokenizer is not None:
+                    if hasattr(self.tokenizer, "apply_chat_template"):
+                        prompt_text = self.tokenizer.apply_chat_template(
+                            final_prompt_messages, tokenize=False, add_generation_prompt=True
+                        )
+                        prompt_tokens = self.tokenizer.encode(prompt_text, add_special_tokens=False)
+                    else:
+                        prompt_text = "\n".join([f"{m['role']}: {m['content']}" for m in final_prompt_messages])
+                        prompt_tokens = self.tokenizer.encode(prompt_text, add_special_tokens=True)
+                    output_tokens = self.tokenizer.encode(final_response, add_special_tokens=False)
+                    tokens = prompt_tokens + output_tokens
+                    masked_tokens = ([-100] * len(prompt_tokens)) + output_tokens
+                    logprobs = ([1.0] * len(prompt_tokens)) + ([0.0] * len(output_tokens))
+                    trajectory_data = SequenceData(
+                        full_text=f"{prompt_text}{final_response}",
+                        tokens=tokens,
+                        masked_tokens=masked_tokens,
+                        logprobs=logprobs,
+                    )
                 return AgentResult(
                     success=False,
                     final_response=final_response,
                     steps=steps,
                     error=f"Reached maximum steps ({self.config.max_steps})",
+                    trajectory_data=trajectory_data,
                 )

             # Build result with trajectory data
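Note on the fallback above: when no sequence node exists, the code hand-builds a `SequenceData` using the standard prompt-masking convention — `masked_tokens` mirrors `tokens` but holds -100 at every prompt position, so the loss covers only the completion, and the logprobs are placeholders. A toy illustration of the alignment invariant (token IDs are made up):

```python
# Illustrative token IDs only; real values come from the tokenizer.
prompt_tokens = [101, 7592, 2088]   # tokenized prompt
output_tokens = [3000, 3001, 102]   # tokenized completion

tokens = prompt_tokens + output_tokens
masked_tokens = [-100] * len(prompt_tokens) + output_tokens
logprobs = [1.0] * len(prompt_tokens) + [0.0] * len(output_tokens)  # placeholders, not real logprobs

# All three sequences must stay position-aligned; -100 positions are
# excluded from the training loss, so only the completion is trained on.
assert len(tokens) == len(masked_tokens) == len(logprobs)
```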
@@ -362,10 +362,37 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]):
             flush=True,
         )
         if not result.success or result.trajectory_data is None:
-            return None, []
+            # Do not trigger BaseEnv retries for agent failures.
+            # Record the trajectory with score 0.0 so training/eval can see the failure mode.
+            messages = [{"role": "system", "content": agent._build_system_prompt()}]  # noqa: SLF001
+            messages.append({"role": "user", "content": task})
+            for step in result.steps:
+                messages.append({"role": "assistant", "content": step.assistant_message})
+                if step.tool_results:
+                    tool_text = "\n".join(r.to_xml() for r in step.tool_results)
+                    messages.append({"role": "user", "content": tool_text})
+
+            scored: ScoredDataItem = {
+                "tokens": (result.trajectory_data.tokens if result.trajectory_data else []),
+                "masks": (result.trajectory_data.masked_tokens if result.trajectory_data else []),
+                "scores": 0.0,
+            }
+            if self.config.include_messages:
+                # Record a final failure marker as a user-side tool_response-like block so it survives templates.
+                import json
+
+                err = result.error or "agent_failed"
+                messages.append(
+                    {
+                        "role": "user",
+                        "content": f"<tool_response>{json.dumps({'success': False, 'error': err})}</tool_response>",
+                    }
+                )
+                scored["messages"] = messages
+            return scored, []

         print(f"[AgentEnv] tid={trajectory_id} verify_and_score_trajectory() start", flush=True)
-        score, _score_metadata = await self.verify_and_score_trajectory(
+        score, score_metadata = await self.verify_and_score_trajectory(
             item,
             result.final_response,
             trajectory_id=trajectory_id,
@@ -387,6 +414,14 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]):
                 tool_text = "\n".join(r.to_xml() for r in step.tool_results)
                 messages.append({"role": "user", "content": tool_text})

+        # Optional: allow env verification to attach additional messages (e.g. install logs).
+        if self.config.include_messages and isinstance(score_metadata, dict):
+            extra = score_metadata.get("verification_messages")
+            if isinstance(extra, list):
+                for m in extra:
+                    if isinstance(m, dict) and isinstance(m.get("role"), str) and isinstance(m.get("content"), str):
+                        messages.append({"role": m["role"], "content": m["content"]})
+
         scored: ScoredDataItem = {
             "tokens": result.trajectory_data.tokens,
             "masks": result.trajectory_data.masked_tokens,
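Note on the contract introduced here: `verify_and_score_trajectory` returns `(score, metadata)`, and the metadata dict may carry a `verification_messages` list of role/content dicts that get appended to the recorded messages. A minimal sketch of an env honoring that contract — the class and the check inside it are illustrative; only the metadata key and entry shape come from this diff:

```python
from typing import Any, Dict, Tuple

class MyEnv:  # hypothetical AgentEnv subclass, for illustration only
    async def verify_and_score_trajectory(self, item, final_response, **kwargs) -> Tuple[float, Dict[str, Any]]:
        ok = "expected output" in final_response  # placeholder verification
        metadata: Dict[str, Any] = {
            "passed": ok,
            # Each entry must be a dict with string "role" and "content";
            # AgentEnv's isinstance checks silently drop anything else.
            "verification_messages": [
                {"role": "user", "content": "<tool_response>verification log</tool_response>"},
            ],
        }
        return (1.0 if ok else 0.0), metadata
```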
@@ -234,6 +234,8 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
             "Constraints:\n"
             "- Use a workspace-local virtualenv (e.g. inside the repo at ./.venv) to avoid cross-run contamination.\n"
             "- Use non-interactive commands only.\n\n"
+            "- Terminal commands run under POSIX /bin/sh and each tool call runs in a fresh shell (no persisted env vars).\n"
+            " Avoid bash-only `source`; prefer `. .venv/bin/activate` or `.venv/bin/python ...`.\n\n"
             f"{verify_note}\n"
             f"{trunc_note}\n"
             "Problem statement:\n"
@@ -365,13 +367,20 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
         *,
         trajectory_id: str,
         exec_tool,
-        agent_result=None,  # noqa: ARG002
+        agent_result=None,
         workspace_meta: Optional[Dict[str, Any]] = None,
     ) -> tuple[float, Dict[str, Any]]:
         _ = trajectory_id
         repo_dir = self._repo_name(item)

         if self.config.verification_mode == "install":
+            # Training correctness: do not reward trajectories that never actually used tools.
+            if agent_result is not None and getattr(agent_result, "total_tool_calls", 0) <= 0:
+                return 0.0, {
+                    "verification_mode": "install",
+                    "error": "No tool calls were made by the agent",
+                }
+
             print(f"[SweSmithOracleEnv] tid={trajectory_id} verify (install): running pip install -e .", flush=True)
             t0 = time.perf_counter()
             install_cmd = (
@@ -394,6 +403,14 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
                 "verification_mode": "install",
                 "install_success": ok,
                 "error": res.error,
+                "verification_messages": [{"role": "user", "content": res.to_xml()}],
             }

+        # Training correctness: do not reward trajectories that never actually used tools.
+        if agent_result is not None and getattr(agent_result, "total_tool_calls", 0) <= 0:
+            return 0.0, {
+                "verification_mode": "pytest",
+                "error": "No tool calls were made by the agent",
+            }
+
         nodeids = self._tests_for_item(item)
@@ -412,12 +429,14 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
         setup_res = await exec_tool(
             ToolCall(name="terminal", arguments={"command": setup_cmd, "timeout": self.config.install_timeout_s})
         )
+        verification_messages = [{"role": "user", "content": setup_res.to_xml()}]
         if not setup_res.success:
             return 0.0, {
                 "verification_mode": "pytest",
                 "phase": "install",
                 "error": setup_res.error,
                 "output": setup_res.output,
+                "verification_messages": verification_messages,
             }

         chunks = self._chunk_nodeids(nodeids, max_per_chunk=50)
@@ -430,10 +449,18 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
                     arguments={"command": cmd, "timeout": self.config.test_timeout_s},
                 )
             )
+            verification_messages.append({"role": "user", "content": res.to_xml()})
             if not res.success:
-                return 0.0, {"failed_chunk": chunk_idx, "error": res.error, "output": res.output}
+                return 0.0, {
+                    "verification_mode": "pytest",
+                    "phase": "pytest",
+                    "failed_chunk": chunk_idx,
+                    "error": res.error,
+                    "output": res.output,
+                    "verification_messages": verification_messages,
+                }

-        return 1.0, {"verification_mode": "pytest", "passed": True}
+        return 1.0, {"verification_mode": "pytest", "passed": True, "verification_messages": verification_messages}

     async def score_trajectory(self, item: Item, final_response: str) -> float:
         # Not used; scoring happens in verify_and_score_trajectory.
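Note on `_chunk_nodeids(nodeids, max_per_chunk=50)`: batching pytest node IDs bounds the length of each /bin/sh command line and limits how much one failing chunk obscures. A plausible sketch, assuming plain fixed-size slicing (the repo's actual helper may differ):

```python
from typing import List

def chunk_nodeids(nodeids: List[str], max_per_chunk: int = 50) -> List[List[str]]:
    # Fixed-size slicing: ["a", "b", "c"] with max_per_chunk=2 -> [["a", "b"], ["c"]]
    return [nodeids[i : i + max_per_chunk] for i in range(0, len(nodeids), max_per_chunk)]
```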
@@ -23,7 +23,9 @@ class TerminalTool(Tool):
             description=(
                 "Execute a command inside the sandbox slot workspace and return stdout/stderr. "
                 "Filesystem persists within a trajectory slot. Background processes are not supported "
-                "in stateless mode."
+                "in stateless mode. Commands run under POSIX /bin/sh and each tool call runs in a fresh "
+                "shell (no persisted env vars). Avoid bash-only syntax like `source`; prefer `. .venv/bin/activate` "
+                "or invoke `.venv/bin/python ...` directly."
             ),
             parameters={
                 "command": {"type": "string", "description": "The command to execute"},
@@ -95,4 +97,3 @@ class WriteFileTool(Tool):

     async def execute(self, **_kwargs) -> ToolResult:
         return ToolResult(success=False, error="write_file must be executed via ToolExecutor inside the sandbox")
-
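Note on the fresh-shell semantics documented above: since no state survives between tool calls, any setup a command depends on has to happen in the same call. A hedged example of composing such a payload — the command string and repo path are illustrative:

```python
import json

# One tool call = one /bin/sh process: chain venv creation, POSIX-style
# activation, install, and the test run with && so they share one shell.
tool_call = {
    "name": "terminal",
    "arguments": {
        "command": "cd repo && python -m venv .venv && . .venv/bin/activate "
                   "&& python -m pip install -e . && pytest -q",
    },
}
print(f"<tool_call>{json.dumps(tool_call)}</tool_call>")
```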