mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
* chore: remove Atropos RL environments, tools, tests, skill, and tinker-atropos submodule Delete: - environments/ (43 files — base env, agent loop, tool call parsers, benchmarks) - rl_cli.py (standalone RL training CLI) - tools/rl_training_tool.py (all 10 rl_* tools) - tests: test_rl_training_tool, test_tool_call_parsers, test_managed_server_tool_support, test_agent_loop, test_agent_loop_vllm, test_agent_loop_tool_calling, test_terminalbench2_env_security - optional-skills/mlops/hermes-atropos-environments/ - tinker-atropos git submodule + .gitmodules * chore: remove RL/Atropos references from Python source - toolsets.py: remove rl toolset block + update comment - model_tools.py: remove rl_tools group + update async bridging comment - hermes_cli/tools_config.py: remove RL display entry, _DEFAULT_OFF_TOOLSETS, setup block, and rl_training post-setup handler - tools/budget_config.py: remove RL environment reference in docstring - tests/test_model_tools.py: remove rl_tools from expected groups - tests/run_agent/test_streaming_tool_call_repair.py: fix stale cross-reference * chore: remove rl/yc-bench extras and tinker-atropos refs from pyproject.toml - Remove rl extra (atroposlib, tinker, fastapi, uvicorn, wandb) - Remove yc-bench extra - Remove rl_cli from py-modules - Remove [tool.ty.src] exclude for tinker-atropos - Remove [tool.ruff] exclude for tinker-atropos - Regenerate uv.lock * chore: remove tinker-atropos from install/setup scripts - setup-hermes.sh: remove entire tinker-atropos submodule install block - scripts/install.sh: remove both tinker-atropos blocks (Termux + standard) - scripts/install.ps1: remove tinker-atropos block - nix/hermes-agent.nix: remove tinker-atropos pip install line * chore: remove RL references from cli-config.yaml.example * docs: remove Atropos/RL references from README, CONTRIBUTING, AGENTS.md * docs: remove RL/Atropos references from website - Delete: environments.md, rl-training.md, mlops-hermes-atropos-environments.md - sidebars.ts: remove rl-training and environments sidebar entries - optional-skills-catalog.md: remove hermes-atropos-environments row - tools-reference.md: remove entire rl toolset section - toolsets-reference.md: remove rl row + update example - integrations/index.md: remove RL Training bullet - architecture.md: remove environments/ from tree + RL section - contributing.md: remove tinker-atropos setup - updating.md: remove tinker-atropos install + stale submodule update * chore: remove remaining RL/Atropos stragglers - hermes_cli/config.py: remove TINKER_API_KEY + WANDB_API_KEY env var defs - hermes_cli/doctor.py: remove Submodules check section (tinker-atropos) - hermes_cli/setup.py: remove RL Training status check - hermes_cli/status.py: remove Tinker + WandB from API key status display - agent/display.py: remove both rl_* tool preview/activity blocks - website/docs: remove RL references from providers.md + env-variables.md - tests: remove TINKER_API_KEY from conftest, set_config_value, setup_script * chore: remove RL training section from .env.example
360 lines
14 KiB
Python
360 lines
14 KiB
Python
"""Tests for model_tools.py — function call dispatch, agent-loop interception, legacy toolsets."""
|
|
|
|
import json
|
|
from unittest.mock import ANY, call, patch
|
|
|
|
import pytest
|
|
|
|
from model_tools import (
|
|
handle_function_call,
|
|
get_all_tool_names,
|
|
get_toolset_for_tool,
|
|
_AGENT_LOOP_TOOLS,
|
|
_LEGACY_TOOLSET_MAP,
|
|
TOOL_TO_TOOLSET_MAP,
|
|
)
|
|
|
|
|
|
# =========================================================================
|
|
# handle_function_call
|
|
# =========================================================================
|
|
|
|
class TestHandleFunctionCall:
|
|
def test_agent_loop_tool_returns_error(self):
|
|
for tool_name in _AGENT_LOOP_TOOLS:
|
|
result = json.loads(handle_function_call(tool_name, {}))
|
|
assert "error" in result
|
|
assert "agent loop" in result["error"].lower()
|
|
|
|
def test_unknown_tool_returns_error(self):
|
|
result = json.loads(handle_function_call("totally_fake_tool_xyz", {}))
|
|
assert "error" in result
|
|
assert "totally_fake_tool_xyz" in result["error"]
|
|
|
|
def test_exception_returns_json_error(self):
|
|
# Even if something goes wrong, should return valid JSON
|
|
result = handle_function_call("web_search", None) # None args may cause issues
|
|
parsed = json.loads(result)
|
|
assert isinstance(parsed, dict)
|
|
assert "error" in parsed
|
|
assert len(parsed["error"]) > 0
|
|
assert "error" in parsed["error"].lower() or "failed" in parsed["error"].lower()
|
|
|
|
def test_tool_hooks_receive_session_and_tool_call_ids(self):
|
|
with (
|
|
patch("model_tools.registry.dispatch", return_value='{"ok":true}'),
|
|
patch("hermes_cli.plugins.invoke_hook") as mock_invoke_hook,
|
|
):
|
|
result = handle_function_call(
|
|
"web_search",
|
|
{"q": "test"},
|
|
task_id="task-1",
|
|
tool_call_id="call-1",
|
|
session_id="session-1",
|
|
)
|
|
|
|
assert result == '{"ok":true}'
|
|
assert mock_invoke_hook.call_args_list == [
|
|
call(
|
|
"pre_tool_call",
|
|
tool_name="web_search",
|
|
args={"q": "test"},
|
|
task_id="task-1",
|
|
session_id="session-1",
|
|
tool_call_id="call-1",
|
|
),
|
|
call(
|
|
"post_tool_call",
|
|
tool_name="web_search",
|
|
args={"q": "test"},
|
|
result='{"ok":true}',
|
|
task_id="task-1",
|
|
session_id="session-1",
|
|
tool_call_id="call-1",
|
|
duration_ms=ANY,
|
|
),
|
|
call(
|
|
"transform_tool_result",
|
|
tool_name="web_search",
|
|
args={"q": "test"},
|
|
result='{"ok":true}',
|
|
task_id="task-1",
|
|
session_id="session-1",
|
|
tool_call_id="call-1",
|
|
duration_ms=ANY,
|
|
),
|
|
]
|
|
|
|
def test_post_tool_call_receives_non_negative_integer_duration_ms(self):
|
|
"""Regression: post_tool_call and transform_tool_result hooks must
|
|
receive a non-negative integer ``duration_ms`` kwarg measuring
|
|
dispatch latency. Inspired by Claude Code 2.1.119, which added
|
|
``duration_ms`` to its PostToolUse hook inputs.
|
|
"""
|
|
with (
|
|
patch("model_tools.registry.dispatch", return_value='{"ok":true}'),
|
|
patch("hermes_cli.plugins.invoke_hook") as mock_invoke_hook,
|
|
):
|
|
handle_function_call("web_search", {"q": "test"}, task_id="t1")
|
|
|
|
kwargs_by_hook = {
|
|
c.args[0]: c.kwargs for c in mock_invoke_hook.call_args_list
|
|
}
|
|
assert "duration_ms" in kwargs_by_hook["post_tool_call"]
|
|
assert "duration_ms" in kwargs_by_hook["transform_tool_result"]
|
|
|
|
post_duration = kwargs_by_hook["post_tool_call"]["duration_ms"]
|
|
transform_duration = kwargs_by_hook["transform_tool_result"]["duration_ms"]
|
|
assert isinstance(post_duration, int)
|
|
assert post_duration >= 0
|
|
# Both hooks should observe the same measured duration.
|
|
assert post_duration == transform_duration
|
|
# pre_tool_call does NOT get duration_ms (nothing has run yet).
|
|
assert "duration_ms" not in kwargs_by_hook["pre_tool_call"]
|
|
|
|
|
|
# =========================================================================
|
|
# Agent loop tools
|
|
# =========================================================================
|
|
|
|
class TestAgentLoopTools:
|
|
def test_expected_tools_in_set(self):
|
|
assert "todo" in _AGENT_LOOP_TOOLS
|
|
assert "memory" in _AGENT_LOOP_TOOLS
|
|
assert "session_search" in _AGENT_LOOP_TOOLS
|
|
assert "delegate_task" in _AGENT_LOOP_TOOLS
|
|
|
|
def test_no_regular_tools_in_set(self):
|
|
assert "web_search" not in _AGENT_LOOP_TOOLS
|
|
assert "terminal" not in _AGENT_LOOP_TOOLS
|
|
|
|
|
|
# =========================================================================
|
|
# Pre-tool-call blocking via plugin hooks
|
|
# =========================================================================
|
|
|
|
class TestPreToolCallBlocking:
|
|
"""Verify that pre_tool_call hooks can block tool execution."""
|
|
|
|
def test_blocked_tool_returns_error_and_skips_dispatch(self, monkeypatch):
|
|
def fake_invoke_hook(hook_name, **kwargs):
|
|
if hook_name == "pre_tool_call":
|
|
return [{"action": "block", "message": "Blocked by policy"}]
|
|
return []
|
|
|
|
dispatch_called = False
|
|
_orig_dispatch = None
|
|
|
|
def fake_dispatch(*args, **kwargs):
|
|
nonlocal dispatch_called
|
|
dispatch_called = True
|
|
raise AssertionError("dispatch should not run when blocked")
|
|
|
|
monkeypatch.setattr("hermes_cli.plugins.invoke_hook", fake_invoke_hook)
|
|
monkeypatch.setattr("model_tools.registry.dispatch", fake_dispatch)
|
|
|
|
result = json.loads(handle_function_call("read_file", {"path": "test.txt"}, task_id="t1"))
|
|
assert result == {"error": "Blocked by policy"}
|
|
assert not dispatch_called
|
|
|
|
def test_blocked_tool_skips_read_loop_notification(self, monkeypatch):
|
|
notifications = []
|
|
|
|
def fake_invoke_hook(hook_name, **kwargs):
|
|
if hook_name == "pre_tool_call":
|
|
return [{"action": "block", "message": "Blocked"}]
|
|
return []
|
|
|
|
monkeypatch.setattr("hermes_cli.plugins.invoke_hook", fake_invoke_hook)
|
|
monkeypatch.setattr("model_tools.registry.dispatch",
|
|
lambda *a, **kw: (_ for _ in ()).throw(AssertionError("should not run")))
|
|
monkeypatch.setattr("tools.file_tools.notify_other_tool_call",
|
|
lambda task_id: notifications.append(task_id))
|
|
|
|
result = json.loads(handle_function_call("web_search", {"q": "test"}, task_id="t1"))
|
|
assert result == {"error": "Blocked"}
|
|
assert notifications == []
|
|
|
|
def test_invalid_hook_returns_do_not_block(self, monkeypatch):
|
|
"""Malformed hook returns should be ignored — tool executes normally."""
|
|
def fake_invoke_hook(hook_name, **kwargs):
|
|
if hook_name == "pre_tool_call":
|
|
return [
|
|
"block",
|
|
{"action": "block"}, # missing message
|
|
{"action": "deny", "message": "nope"},
|
|
]
|
|
return []
|
|
|
|
monkeypatch.setattr("hermes_cli.plugins.invoke_hook", fake_invoke_hook)
|
|
monkeypatch.setattr("model_tools.registry.dispatch",
|
|
lambda *a, **kw: json.dumps({"ok": True}))
|
|
|
|
result = json.loads(handle_function_call("read_file", {"path": "test.txt"}, task_id="t1"))
|
|
assert result == {"ok": True}
|
|
|
|
def test_skip_flag_prevents_double_fire(self, monkeypatch):
|
|
"""When skip_pre_tool_call_hook=True, the hook does not fire again.
|
|
|
|
The caller (e.g. run_agent._invoke_tool) has already called
|
|
get_pre_tool_call_block_message(), which fires the hook once.
|
|
handle_function_call must NOT fire it a second time — that was
|
|
the classic double-fire bug where observer hooks logged every
|
|
tool call twice.
|
|
"""
|
|
hook_calls = []
|
|
|
|
def fake_invoke_hook(hook_name, **kwargs):
|
|
hook_calls.append(hook_name)
|
|
return []
|
|
|
|
monkeypatch.setattr("hermes_cli.plugins.invoke_hook", fake_invoke_hook)
|
|
monkeypatch.setattr("model_tools.registry.dispatch",
|
|
lambda *a, **kw: json.dumps({"ok": True}))
|
|
|
|
handle_function_call("web_search", {"q": "test"}, task_id="t1",
|
|
skip_pre_tool_call_hook=True)
|
|
|
|
# Single-fire contract: when skip=True the caller already fired
|
|
# pre_tool_call, so handle_function_call must not fire it again.
|
|
assert hook_calls.count("pre_tool_call") == 0, (
|
|
f"pre_tool_call fired {hook_calls.count('pre_tool_call')} times "
|
|
f"with skip_pre_tool_call_hook=True; expected 0 "
|
|
f"(caller already fired it). hook_calls={hook_calls}"
|
|
)
|
|
# post_tool_call and transform_tool_result still fire — only the
|
|
# pre-call block-check path is suppressed by the skip flag.
|
|
assert "post_tool_call" in hook_calls
|
|
assert "transform_tool_result" in hook_calls
|
|
|
|
def test_run_agent_pattern_fires_pre_tool_call_exactly_once(self, monkeypatch):
|
|
"""End-to-end regression for the double-fire bug.
|
|
|
|
Mirrors run_agent._invoke_tool: first calls
|
|
get_pre_tool_call_block_message() (which fires the hook as part of
|
|
its block-directive poll), then calls
|
|
handle_function_call(skip_pre_tool_call_hook=True). The plugin
|
|
hook MUST fire exactly once across both calls — not twice as it
|
|
did before the fix (observer plugins were seeing every tool
|
|
execution logged twice).
|
|
"""
|
|
from hermes_cli.plugins import get_pre_tool_call_block_message
|
|
|
|
hook_calls = []
|
|
|
|
def fake_invoke_hook(hook_name, **kwargs):
|
|
hook_calls.append(hook_name)
|
|
return []
|
|
|
|
monkeypatch.setattr("hermes_cli.plugins.invoke_hook", fake_invoke_hook)
|
|
monkeypatch.setattr("model_tools.registry.dispatch",
|
|
lambda *a, **kw: json.dumps({"ok": True}))
|
|
|
|
# Step 1: caller checks for a block directive (this fires pre_tool_call once).
|
|
block = get_pre_tool_call_block_message(
|
|
"web_search", {"q": "test"}, task_id="t1",
|
|
)
|
|
assert block is None
|
|
|
|
# Step 2: caller dispatches with skip=True so the hook isn't re-fired.
|
|
handle_function_call(
|
|
"web_search", {"q": "test"}, task_id="t1",
|
|
skip_pre_tool_call_hook=True,
|
|
)
|
|
|
|
assert hook_calls.count("pre_tool_call") == 1, (
|
|
f"pre_tool_call fired {hook_calls.count('pre_tool_call')} times "
|
|
f"across the run_agent (block-check + dispatch) path; "
|
|
f"expected exactly 1. hook_calls={hook_calls}"
|
|
)
|
|
|
|
|
|
# =========================================================================
|
|
# Legacy toolset map
|
|
# =========================================================================
|
|
|
|
class TestLegacyToolsetMap:
|
|
def test_expected_legacy_names(self):
|
|
expected = [
|
|
"web_tools", "terminal_tools", "vision_tools", "moa_tools",
|
|
"image_tools", "skills_tools", "browser_tools", "cronjob_tools",
|
|
"file_tools", "tts_tools",
|
|
]
|
|
for name in expected:
|
|
assert name in _LEGACY_TOOLSET_MAP, f"Missing legacy toolset: {name}"
|
|
|
|
def test_values_are_lists_of_strings(self):
|
|
for name, tools in _LEGACY_TOOLSET_MAP.items():
|
|
assert isinstance(tools, list), f"{name} is not a list"
|
|
for tool in tools:
|
|
assert isinstance(tool, str), f"{name} contains non-string: {tool}"
|
|
|
|
|
|
# =========================================================================
|
|
# Backward-compat wrappers
|
|
# =========================================================================
|
|
|
|
class TestBackwardCompat:
|
|
def test_get_all_tool_names_returns_list(self):
|
|
names = get_all_tool_names()
|
|
assert isinstance(names, list)
|
|
assert len(names) > 0
|
|
# Should contain well-known tools
|
|
assert "web_search" in names
|
|
assert "terminal" in names
|
|
|
|
def test_get_toolset_for_tool(self):
|
|
result = get_toolset_for_tool("web_search")
|
|
assert result is not None
|
|
assert isinstance(result, str)
|
|
|
|
def test_get_toolset_for_unknown_tool(self):
|
|
result = get_toolset_for_tool("totally_nonexistent_tool")
|
|
assert result is None
|
|
|
|
def test_tool_to_toolset_map(self):
|
|
assert isinstance(TOOL_TO_TOOLSET_MAP, dict)
|
|
assert len(TOOL_TO_TOOLSET_MAP) > 0
|
|
|
|
|
|
# =========================================================================
|
|
# _coerce_number — inf / nan must fall through to the original string
|
|
# (regression: fix: eliminate duplicate checkpoint entries and JSON-unsafe coercion)
|
|
# =========================================================================
|
|
|
|
class TestCoerceNumberInfNan:
|
|
"""_coerce_number must honor its documented contract ("Returns original
|
|
string on failure") for inf/nan inputs, because float('inf') and
|
|
float('nan') are not JSON-compliant under strict serialization."""
|
|
|
|
def test_inf_returns_original_string(self):
|
|
from model_tools import _coerce_number
|
|
assert _coerce_number("inf") == "inf"
|
|
|
|
def test_negative_inf_returns_original_string(self):
|
|
from model_tools import _coerce_number
|
|
assert _coerce_number("-inf") == "-inf"
|
|
|
|
def test_nan_returns_original_string(self):
|
|
from model_tools import _coerce_number
|
|
assert _coerce_number("nan") == "nan"
|
|
|
|
def test_infinity_spelling_returns_original_string(self):
|
|
from model_tools import _coerce_number
|
|
# Python's float() parses "Infinity" too — still not JSON-safe.
|
|
assert _coerce_number("Infinity") == "Infinity"
|
|
|
|
def test_coerced_result_is_strict_json_safe(self):
|
|
"""Whatever _coerce_number returns for inf/nan must round-trip
|
|
through strict (allow_nan=False) json.dumps without raising."""
|
|
from model_tools import _coerce_number
|
|
for s in ("inf", "-inf", "nan", "Infinity"):
|
|
result = _coerce_number(s)
|
|
json.dumps({"x": result}, allow_nan=False) # must not raise
|
|
|
|
def test_normal_numbers_still_coerce(self):
|
|
"""Guard against over-correction — real numbers still coerce."""
|
|
from model_tools import _coerce_number
|
|
assert _coerce_number("42") == 42
|
|
assert _coerce_number("3.14") == 3.14
|
|
assert _coerce_number("1e3") == 1000
|