hermes-agent/tests/run_agent/test_repair_tool_call_name.py
Martín Alcalá Rubí 132d6fe6d6 fix(volcengine): strip XML attribute fragments from tool_use.name (#33007)
VolcEngine's api/plan endpoint occasionally leaks raw XML attribute
fragments into tool_use.name when its protocol-translation layer
converts the model's native XML-style tool emission to Anthropic
Messages tool_use blocks, producing names like:

  terminal" parameter="command" string="true
  execute_code" parameter="code" string="true
  session_search" parameter="session_id" string="true

The corruption happens server-side at the provider, but it breaks
every tool call for affected users — no normalization rule in
repair_tool_call can rescue them, so each request runs through three
retries and then aborts as partial.

Add an early sanitizer in agent_runtime_helpers.repair_tool_call that
truncates the name at the first double quote ("), single quote ('),
'<', or '>' character, but only when that character sits at index > 0,
so the rest of the existing repair pipeline (lowercase / snake_case /
fuzzy match) can resolve the cleaned name normally.

Whitespace is deliberately NOT a separator — the legitimate
"write file" -> write_file repair path (covered by
test_space_to_underscore) must keep working.

Tests: 11 new regression cases in TestVolcEngineXmlPollution
covering all three observed polluted names, CamelCase + pollution
mix, single-quote variants, angle-bracket variants, clean-name
passthrough, and the whitespace-preservation guard. All 18
pre-existing repair tests still pass (29 total in the file).
2026-06-07 22:22:01 -07:00

188 lines
7.2 KiB
Python

"""Tests for AIAgent._repair_tool_call — tool-name normalization.
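Regression guard for #14784: Claude-style models sometimes emit
class-like tool-call names (``TodoTool_tool``, ``Patch_tool``,
``BrowserClick_tool``, ``PatchTool``). Before the fix such calls
returned "Unknown tool" even though the target tool was registered
under a snake_case name. The repair routine now normalizes CamelCase,
strips trailing ``_tool`` / ``-tool`` / ``tool`` suffixes (up to
twice to handle double-tacked suffixes like ``TodoTool_tool``), and
falls back to fuzzy match.
Also covers #33007: tool names polluted with XML attribute fragments
(for example ``terminal" parameter="command" string="true``) are
trimmed at the first quote or angle bracket so the rest of the repair
pipeline can resolve the cleaned name.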
"""
from __future__ import annotations
from types import SimpleNamespace
import pytest
# Tool names handed to the stub agent as ``valid_tool_names``; every
# successful repair asserted in this module resolves to one of these.
VALID = {
    "todo", "patch",
    "browser_click", "browser_navigate",
    "web_search",
    "read_file", "write_file",
    "terminal", "execute_code", "session_search",
}
@pytest.fixture
def repair():
    """Provide ``AIAgent._repair_tool_call`` bound to a lightweight stand-in.

    Constructing a genuine AIAgent would drag in credential resolution,
    the session DB, and more. The repair routine is expected to read
    only ``self.valid_tool_names``, so a ``SimpleNamespace`` carrying
    that single attribute is enough to play the role of ``self``.
    """
    from run_agent import AIAgent

    fake_agent = SimpleNamespace(valid_tool_names=VALID)
    unbound = AIAgent._repair_tool_call
    # Bind the function to the stand-in via the descriptor protocol so
    # tests can simply call ``repair(name)``.
    return unbound.__get__(fake_agent, AIAgent)
class TestExistingBehaviorStillWorks:
    """Guard the long-standing repairs against regressions."""

    def test_lowercase_already_matches(self, repair):
        resolved = repair("browser_click")
        assert resolved == "browser_click"

    def test_uppercase_simple(self, repair):
        resolved = repair("TERMINAL")
        assert resolved == "terminal"

    def test_dash_to_underscore(self, repair):
        resolved = repair("web-search")
        assert resolved == "web_search"

    def test_space_to_underscore(self, repair):
        resolved = repair("write file")
        assert resolved == "write_file"

    def test_fuzzy_near_miss(self, repair):
        # A single stray character — expected to be recovered by the
        # fuzzy match at its 0.7 cutoff.
        resolved = repair("terminall")
        assert resolved == "terminal"

    def test_unknown_returns_none(self, repair):
        resolved = repair("xyz_no_such_tool")
        assert resolved is None
class TestClassLikeEmissions:
    """Regression cases for #14784: CamelCase names and ``_tool`` suffixes."""

    def test_camel_case_no_suffix(self, repair):
        resolved = repair("BrowserClick")
        assert resolved == "browser_click"

    def test_camel_case_with_underscore_tool_suffix(self, repair):
        resolved = repair("BrowserClick_tool")
        assert resolved == "browser_click"

    def test_camel_case_with_Tool_class_suffix(self, repair):
        resolved = repair("PatchTool")
        assert resolved == "patch"

    def test_double_tacked_class_and_snake_suffix(self, repair):
        # The hardest reported case: both the trailing '_tool' and the
        # embedded CamelCase 'Tool' must go before 'todo' is left.
        resolved = repair("TodoTool_tool")
        assert resolved == "todo"

    def test_simple_name_with_tool_suffix(self, repair):
        resolved = repair("Patch_tool")
        assert resolved == "patch"

    def test_simple_name_with_dash_tool_suffix(self, repair):
        resolved = repair("patch-tool")
        assert resolved == "patch"

    def test_camel_case_preserves_multi_word_match(self, repair):
        # Checked in the same order as before; the first failure stops
        # the test.
        expectations = (
            ("ReadFile_tool", "read_file"),
            ("WriteFileTool", "write_file"),
        )
        for emitted, expected in expectations:
            assert repair(emitted) == expected

    def test_mixed_separators_and_suffix(self, repair):
        resolved = repair("write-file_Tool")
        assert resolved == "write_file"
class TestEdgeCases:
    """Degenerate inputs: the routine must neither raise nor guess a tool."""

    def test_empty_string(self, repair):
        outcome = repair("")
        assert outcome is None

    def test_only_tool_suffix(self, repair):
        # A bare '_tool' carries no tool name at all, so nothing
        # plausible should be matched for it.
        outcome = repair("_tool")
        assert outcome is None

    def test_none_passed_as_name(self, repair):
        # Real callers always hand over a str; this protects against an
        # upstream bug that forwards None instead.
        outcome = repair(None)
        assert outcome is None

    def test_very_long_name_does_not_match_by_accident(self, repair):
        # The fuzzy matcher must not adopt a tool for an obviously
        # unrelated name.
        outcome = repair("ThisIsNotRemotelyARealToolName_tool")
        assert outcome is None
class TestVolcEngineXmlPollution:
    """Regression coverage for #33007 — VolcEngine ``api/plan`` endpoint
    leaks raw XML attribute fragments into ``tool_use.name``.

    Observed in production with the ``anthropic_messages`` API mode::

        terminal" parameter="command" string="true
        execute_code" parameter="code" string="true
        session_search" parameter="session_id" string="true

    The fix trims at the first ``"``/``'``/``<``/``>`` so the rest of
    the repair pipeline can resolve the cleaned name to a real tool.
    Each of those four trim characters has at least one case below.
    """

    def test_terminal_with_xml_attribute_pollution(self, repair):
        # Exact pattern from the bug report (terminal call).
        polluted = 'terminal" parameter="command" string="true'
        assert repair(polluted) == "terminal"

    def test_execute_code_with_xml_attribute_pollution(self, repair):
        polluted = 'execute_code" parameter="code" string="true'
        assert repair(polluted) == "execute_code"

    def test_session_search_with_xml_attribute_pollution(self, repair):
        polluted = 'session_search" parameter="session_id" string="true'
        assert repair(polluted) == "session_search"

    def test_camel_case_tool_with_xml_pollution(self, repair):
        # If the polluted prefix is CamelCase / suffixed, the rest of
        # the pipeline (CamelCase -> snake_case, _tool strip) still runs.
        polluted = 'BrowserClick_tool" parameter="selector" string="true'
        assert repair(polluted) == "browser_click"

    def test_tool_name_with_trailing_quote_only(self, repair):
        # Minimal leak — just a stray trailing quote, no full attribute.
        assert repair('terminal"') == "terminal"

    def test_tool_name_with_angle_bracket_pollution(self, repair):
        # Defensive — same root cause, raw '<' bleeding through.
        assert repair("terminal<parameter=command") == "terminal"

    def test_tool_name_with_closing_angle_bracket_pollution(self, repair):
        # Defensive — '>' is the fourth documented trim character and
        # was the only one without a case of its own.
        assert repair("terminal>parameter=command") == "terminal"

    def test_tool_name_with_single_quote_pollution(self, repair):
        # Defensive — same root cause, single-quoted attribute style.
        assert repair("terminal' parameter='command' string='true") == "terminal"

    def test_clean_tool_name_unaffected_by_sanitizer(self, repair):
        # Pure passthrough — no XML/quote chars, no change.
        assert repair("execute_code") == "execute_code"
        assert repair("session_search") == "session_search"

    def test_space_separated_name_still_normalizes(self, repair):
        # Critical: the XML strip must NOT consume whitespace, or the
        # legitimate ``"write file" -> write_file`` repair path breaks.
        assert repair("write file") == "write_file"

    def test_pollution_with_unknown_tool_root_still_fails(self, repair):
        # Sanitizer must not mask invalid tool names by laundering them
        # through the cleaner.
        polluted = 'no_such_tool" parameter="x" string="true'
        assert repair(polluted) is None

    def test_leading_quote_falls_through_to_fuzzy_match(self, repair):
        # Sanitizer only trims when the XML char is at idx > 0 — a
        # name that *starts* with a quote is left untouched so the
        # rest of the pipeline (fuzzy match at 0.7 cutoff) can still
        # recover the obvious target.
        assert repair('"terminal"') == "terminal"