Mirror of https://github.com/NousResearch/hermes-agent.git, synced 2026-04-29 01:31:41 +00:00
Fix variable name breakage (run_agent, hermes_constants, etc.) where import rewriter changed 'import X' to 'import hermes_agent.Y' but test code still referenced 'X' as a variable name. Fix package-vs-module confusion (cli.auth, cli.models, cli.ui) where single files became directories. Fix hardcoded file paths in tests pointing to old locations. Fix tool registry to discover tools in subpackage directories. Fix stale import in hermes_agent/tools/__init__.py. Part of #14182, #14183
549 lines · 18 KiB · Python
"""Integration tests for HermesAgentLoop tool calling.
|
|
|
|
Tests the full agent loop with real LLM calls via OpenRouter.
|
|
Uses stepfun/step-3.5-flash:free by default (zero cost), falls back
|
|
to anthropic/claude-sonnet-4 if the free model is unavailable.
|
|
|
|
These tests verify:
|
|
1. Single tool call: model calls a tool, gets result, responds
|
|
2. Multi-tool call: model calls multiple tools in one turn
|
|
3. Multi-turn: model calls tools across multiple turns
|
|
4. Unknown tool rejection: model calling a non-existent tool gets an error
|
|
5. Max turns: loop stops when max_turns is reached
|
|
6. No tools: model responds without calling any tools
|
|
7. Tool error handling: tool execution errors are captured
|
|
|
|
Run:
|
|
pytest tests/test_agent_loop_tool_calling.py -v
|
|
pytest tests/test_agent_loop_tool_calling.py -v -k "single" # run one test
|
|
"""
|
|
|
|
import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, Optional
from unittest.mock import patch

import pytest

# No module-level pytestmark: tests skip gracefully via the
# OPENROUTER_API_KEY check in _get_api_key().

# Ensure the repo root is importable
_repo_root = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(_repo_root))

try:
    from environments.agent_loop import AgentResult, HermesAgentLoop
    from atroposlib.envs.server_handling.openai_server import OpenAIServer  # noqa: F401
except ImportError:
    pytest.skip("atroposlib not installed", allow_module_level=True)

# =========================================================================
# Test infrastructure
# =========================================================================

# Models to try, in order of preference (free first)
_MODELS = [
    "stepfun/step-3.5-flash:free",
    "google/gemini-2.0-flash-001",
    "anthropic/claude-sonnet-4",
]


def _get_api_key() -> str:
    """Return the OpenRouter API key, skipping the test if it is unset."""
    key = os.getenv("OPENROUTER_API_KEY", "")
    if not key:
        pytest.skip("OPENROUTER_API_KEY not set")
    return key


def _make_server(model: Optional[str] = None):
    """Create an OpenAI server for testing."""
    from atroposlib.envs.server_handling.openai_server import OpenAIServer
    from atroposlib.envs.server_handling.server_manager import APIServerConfig

    config = APIServerConfig(
        base_url="https://openrouter.ai/api/v1",
        model_name=model or _MODELS[0],
        server_type="openai",
        api_key=_get_api_key(),
        health_check=False,
    )
    return OpenAIServer(config)


async def _try_models(test_fn):
    """Try running a test with each model until one works."""
    last_error = None
    for model in _MODELS:
        try:
            server = _make_server(model)
            return await test_fn(server, model)
        except Exception as e:
            last_error = e
            if "rate" in str(e).lower() or "limit" in str(e).lower():
                continue  # Rate limited, try the next model
            raise  # Real error
    pytest.skip(f"All models failed. Last error: {last_error}")
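

# Usage pattern (mirrored by every test below): define an inner
# `async def _run(server, model)` coroutine with the actual test body,
# then hand it to the fallback runner:
#     await _try_models(_run)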


# =========================================================================
# Fake tools for testing
# =========================================================================

# Simple calculator tool
CALC_TOOL = {
    "type": "function",
    "function": {
        "name": "calculate",
        "description": "Calculate a math expression. Returns the numeric result.",
        "parameters": {
            "type": "object",
            "properties": {
                "expression": {
                    "type": "string",
                    "description": "Math expression to evaluate, e.g. '2 + 3'",
                }
            },
            "required": ["expression"],
        },
    },
}

# Weather lookup tool
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city. Returns temperature and conditions.",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string",
                    "description": "City name, e.g. 'Tokyo'",
                }
            },
            "required": ["city"],
        },
    },
}

# Lookup tool (always succeeds)
LOOKUP_TOOL = {
    "type": "function",
    "function": {
        "name": "lookup",
        "description": "Look up a fact. Returns a short answer string.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "What to look up",
                }
            },
            "required": ["query"],
        },
    },
}

# Error tool (always fails)
ERROR_TOOL = {
    "type": "function",
    "function": {
        "name": "failing_tool",
        "description": "A tool that always fails with an error.",
        "parameters": {
            "type": "object",
            "properties": {
                "input": {"type": "string"},
            },
            "required": ["input"],
        },
    },
}
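

# For reference (a rough sketch, not the exact wire format): the assertions
# below read assistant tool calls in the OpenAI chat format, approximately
#     {"role": "assistant", "tool_calls": [{"type": "function",
#      "function": {"name": "get_weather", "arguments": '{"city": "Tokyo"}'}}]}
# where "arguments" is a JSON-encoded string, followed by a
# {"role": "tool", ...} message carrying the handler's JSON result.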


def _fake_tool_handler(tool_name: str, args: Dict[str, Any], **kwargs) -> str:
    """Handle fake tool calls for testing."""
    if tool_name == "calculate":
        expr = args.get("expression", "0")
        try:
            # Restricted eval for simple test arithmetic (no builtins; not a
            # real sandbox, but fine for trusted test inputs)
            result = eval(expr, {"__builtins__": {}}, {})
            return json.dumps({"result": result})
        except Exception as e:
            return json.dumps({"error": str(e)})

    elif tool_name == "get_weather":
        city = args.get("city", "Unknown")
        # Return canned weather
        return json.dumps({
            "city": city,
            "temperature": 22,
            "conditions": "sunny",
            "humidity": 45,
        })

    elif tool_name == "lookup":
        query = args.get("query", "")
        return json.dumps({"answer": f"The answer to '{query}' is 42."})

    elif tool_name == "failing_tool":
        raise RuntimeError("This tool always fails!")

    return json.dumps({"error": f"Unknown tool: {tool_name}"})
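

# Quick sanity check (no LLM involved), directly from the handler above:
#     _fake_tool_handler("calculate", {"expression": "2 + 3"})
#         -> '{"result": 5}'
#     _fake_tool_handler("get_weather", {"city": "Tokyo"})
#         -> '{"city": "Tokyo", "temperature": 22, "conditions": "sunny", "humidity": 45}'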


# =========================================================================
# Tests
# =========================================================================


@pytest.mark.asyncio
async def test_single_tool_call():
    """Model should call a single tool, get the result, and respond."""

    async def _run(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        messages = [
            {"role": "user", "content": "What's the weather in Tokyo? Use the get_weather tool."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        assert isinstance(result, AgentResult)
        assert result.turns_used >= 2, (
            f"Expected at least 2 turns (tool call + response), got {result.turns_used}"
        )

        # Verify a tool call happened
        tool_calls_found = False
        for msg in result.messages:
            if msg.get("role") == "assistant" and msg.get("tool_calls"):
                for tc in msg["tool_calls"]:
                    if tc["function"]["name"] == "get_weather":
                        tool_calls_found = True
                        args = json.loads(tc["function"]["arguments"])
                        assert "city" in args
        assert tool_calls_found, "Model should have called get_weather"

        # Verify the tool result is in the conversation
        tool_results = [m for m in result.messages if m.get("role") == "tool"]
        assert len(tool_results) >= 1, "Should have at least one tool result"

        # Verify the final assistant message has text content
        final_msg = result.messages[-1]
        assert final_msg["role"] == "assistant"
        assert final_msg["content"], "Final response should have content"

        return result

    await _try_models(_run)


@pytest.mark.asyncio
async def test_multi_tool_single_turn():
    """Model should call multiple tools in a single turn."""

    async def _run(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL, CALC_TOOL],
            valid_tool_names={"get_weather", "calculate"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        messages = [
            {"role": "user", "content": (
                "I need two things at once: "
                "1) What's the weather in Paris? Use get_weather. "
                "2) What is 15 * 7? Use calculate. "
                "Call BOTH tools in a single response."
            )},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # Count distinct tools called
        tools_called = set()
        for msg in result.messages:
            if msg.get("role") == "assistant" and msg.get("tool_calls"):
                for tc in msg["tool_calls"]:
                    tools_called.add(tc["function"]["name"])

        # At minimum, both tools should have been called (maybe in different turns)
        assert "get_weather" in tools_called, f"get_weather not called. Called: {tools_called}"
        assert "calculate" in tools_called, f"calculate not called. Called: {tools_called}"

        return result

    await _try_models(_run)


@pytest.mark.asyncio
async def test_multi_turn_conversation():
    """Agent should handle multiple turns of tool calls."""

    async def _run(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[LOOKUP_TOOL, CALC_TOOL],
            valid_tool_names={"lookup", "calculate"},
            max_turns=10,
            temperature=0.0,
            max_tokens=500,
        )

        messages = [
            {"role": "user", "content": (
                "First, use the lookup tool to look up 'meaning of life'. "
                "Then use calculate to compute 6 * 7. "
                "Do these in separate tool calls, one at a time."
            )},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # Should have used both tools
        tools_called = set()
        for msg in result.messages:
            if msg.get("role") == "assistant" and msg.get("tool_calls"):
                for tc in msg["tool_calls"]:
                    tools_called.add(tc["function"]["name"])

        assert "lookup" in tools_called, f"lookup not called. Called: {tools_called}"
        assert "calculate" in tools_called, f"calculate not called. Called: {tools_called}"

        # Should finish naturally
        assert result.finished_naturally, "Should finish naturally after answering"

        return result

    await _try_models(_run)


@pytest.mark.asyncio
async def test_unknown_tool_rejected():
    """If the model calls a tool not in valid_tool_names, it gets an error."""

    async def _run(server, model):
        # Only allow "calculate" but give schemas for both
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[CALC_TOOL, WEATHER_TOOL],
            valid_tool_names={"calculate"},  # weather NOT allowed
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        messages = [
            {"role": "user", "content": "What's the weather in London? Use get_weather."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # If the model called get_weather, the call must have been rejected.
        # (If the model declined to call any tool, there is nothing to check.)
        if result.tool_errors:
            weather_errors = [e for e in result.tool_errors if e.tool_name == "get_weather"]
            assert len(weather_errors) > 0, "get_weather should have been rejected"
            assert "Unknown tool" in weather_errors[0].error

        return result

    await _try_models(_run)


@pytest.mark.asyncio
async def test_max_turns_limit():
    """Agent should stop after max_turns even if model keeps calling tools."""

    async def _run(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[LOOKUP_TOOL],
            valid_tool_names={"lookup"},
            max_turns=2,  # Very low limit
            temperature=0.0,
            max_tokens=500,
        )

        messages = [
            {"role": "user", "content": (
                "Keep looking up facts. Look up 'fact 1', then 'fact 2', "
                "then 'fact 3', then 'fact 4'. Do them one at a time."
            )},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        assert result.turns_used <= 2, f"Should stop at max_turns=2, used {result.turns_used}"
        assert not result.finished_naturally, "Should NOT finish naturally (hit max_turns)"

        return result

    await _try_models(_run)


@pytest.mark.asyncio
async def test_no_tools_direct_response():
    """When no tools are useful, model should respond directly."""

    async def _run(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
            temperature=0.0,
            max_tokens=200,
        )

        messages = [
            {"role": "user", "content": "What is 2 + 2? Just answer directly, no tools needed."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        assert result.finished_naturally, "Should finish naturally with a direct response"
        assert result.turns_used == 1, (
            f"Should take exactly 1 turn for a direct answer, took {result.turns_used}"
        )

        final = result.messages[-1]
        assert final["role"] == "assistant"
        assert final["content"], "Should have text content"
        assert "4" in final["content"], "Should contain the answer '4'"

        return result

    await _try_models(_run)


@pytest.mark.asyncio
async def test_tool_error_handling():
    """Tool execution errors should be captured and reported to the model."""

    async def _run(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[ERROR_TOOL],
            valid_tool_names={"failing_tool"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        messages = [
            {"role": "user", "content": "Please call the failing_tool with input 'test'."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # The tool error should be recorded
        assert len(result.tool_errors) >= 1, "Should have at least one tool error"
        assert (
            "RuntimeError" in result.tool_errors[0].error
            or "always fails" in result.tool_errors[0].error
        )

        # The error should be in the conversation as a tool result
        tool_results = [m for m in result.messages if m.get("role") == "tool"]
        assert len(tool_results) >= 1
        error_result = json.loads(tool_results[0]["content"])
        assert "error" in error_result

        return result

    await _try_models(_run)


@pytest.mark.asyncio
async def test_agent_result_structure():
    """Verify the AgentResult has all expected fields populated."""

    async def _run(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[CALC_TOOL],
            valid_tool_names={"calculate"},
            max_turns=5,
            temperature=0.0,
            max_tokens=300,
        )

        messages = [
            {"role": "user", "content": "What is 3 + 4? Use the calculate tool."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # Structural checks
        assert isinstance(result, AgentResult)
        assert isinstance(result.messages, list)
        assert len(result.messages) >= 4, (
            "Should have user + assistant(tool) + tool_result + assistant(final)"
        )
        assert isinstance(result.turns_used, int)
        assert result.turns_used > 0
        assert isinstance(result.finished_naturally, bool)
        assert isinstance(result.tool_errors, list)
        assert isinstance(result.reasoning_per_turn, list)

        # Messages should follow the OpenAI chat format
        for msg in result.messages:
            assert "role" in msg, f"Message missing 'role': {msg}"
            assert msg["role"] in ("system", "user", "assistant", "tool"), f"Invalid role: {msg['role']}"

        return result

    await _try_models(_run)


@pytest.mark.asyncio
async def test_conversation_history_preserved():
    """The full conversation history should be in result.messages."""

    async def _run(server, model):
        agent = HermesAgentLoop(
            server=server,
            tool_schemas=[WEATHER_TOOL],
            valid_tool_names={"get_weather"},
            max_turns=5,
            temperature=0.0,
            max_tokens=500,
        )

        messages = [
            {"role": "system", "content": "You are a helpful weather assistant."},
            {"role": "user", "content": "What's the weather in Berlin? Use get_weather."},
        ]

        with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler):
            result = await agent.run(messages)

        # System message should be preserved
        assert result.messages[0]["role"] == "system"
        assert "weather assistant" in result.messages[0]["content"]

        # User message should be preserved
        assert result.messages[1]["role"] == "user"
        assert "Berlin" in result.messages[1]["content"]

        # Should have an assistant + tool + assistant sequence
        roles = [m["role"] for m in result.messages]
        assert "tool" in roles, "Should have tool results in conversation"

        return result

    await _try_models(_run)