diff --git a/agent/tool_guardrails.py b/agent/tool_guardrails.py index c8a7aa009a..b3237bc390 100644 --- a/agent/tool_guardrails.py +++ b/agent/tool_guardrails.py @@ -2,7 +2,8 @@ The controller in this module is intentionally side-effect free: it tracks per-turn tool-call observations and returns decisions. Runtime code owns whether -those decisions become synthetic tool results or controlled turn halts. +those decisions become warning guidance, synthetic tool results, or controlled +turn halts. """ from __future__ import annotations @@ -60,17 +61,67 @@ MUTATING_TOOL_NAMES = frozenset( @dataclass(frozen=True) class ToolCallGuardrailConfig: - """Thresholds for per-turn tool-call loop detection.""" + """Thresholds for per-turn tool-call loop detection. + Warnings are enabled by default and never prevent tool execution. Hard stops + are explicit opt-in so interactive CLI/TUI sessions get a gentle nudge unless + the user enables circuit-breaker behavior in config.yaml. + """ + + warnings_enabled: bool = True + hard_stop_enabled: bool = False exact_failure_warn_after: int = 2 - exact_failure_block_after: int = 2 + exact_failure_block_after: int = 5 same_tool_failure_warn_after: int = 3 - same_tool_failure_halt_after: int = 5 + same_tool_failure_halt_after: int = 8 no_progress_warn_after: int = 2 - no_progress_block_after: int = 2 + no_progress_block_after: int = 5 idempotent_tools: frozenset[str] = field(default_factory=lambda: IDEMPOTENT_TOOL_NAMES) mutating_tools: frozenset[str] = field(default_factory=lambda: MUTATING_TOOL_NAMES) + @classmethod + def from_mapping(cls, data: Mapping[str, Any] | None) -> "ToolCallGuardrailConfig": + """Build config from the `tool_loop_guardrails` config.yaml section.""" + if not isinstance(data, Mapping): + return cls() + + warn_after = data.get("warn_after") + if not isinstance(warn_after, Mapping): + warn_after = {} + hard_stop_after = data.get("hard_stop_after") + if not isinstance(hard_stop_after, Mapping): + hard_stop_after = {} + + defaults = cls() + return cls( + warnings_enabled=_as_bool(data.get("warnings_enabled"), defaults.warnings_enabled), + hard_stop_enabled=_as_bool(data.get("hard_stop_enabled"), defaults.hard_stop_enabled), + exact_failure_warn_after=_positive_int( + warn_after.get("exact_failure", data.get("exact_failure_warn_after")), + defaults.exact_failure_warn_after, + ), + same_tool_failure_warn_after=_positive_int( + warn_after.get("same_tool_failure", data.get("same_tool_failure_warn_after")), + defaults.same_tool_failure_warn_after, + ), + no_progress_warn_after=_positive_int( + warn_after.get("idempotent_no_progress", data.get("no_progress_warn_after")), + defaults.no_progress_warn_after, + ), + exact_failure_block_after=_positive_int( + hard_stop_after.get("exact_failure", data.get("exact_failure_block_after")), + defaults.exact_failure_block_after, + ), + same_tool_failure_halt_after=_positive_int( + hard_stop_after.get("same_tool_failure", data.get("same_tool_failure_halt_after")), + defaults.same_tool_failure_halt_after, + ), + no_progress_block_after=_positive_int( + hard_stop_after.get("idempotent_no_progress", data.get("no_progress_block_after")), + defaults.no_progress_block_after, + ), + ) + @dataclass(frozen=True) class ToolCallSignature: @@ -192,6 +243,8 @@ class ToolCallGuardrailController: def before_call(self, tool_name: str, args: Mapping[str, Any] | None) -> ToolGuardrailDecision: signature = ToolCallSignature.from_call(tool_name, _coerce_args(args)) + if not self.config.hard_stop_enabled: + return ToolGuardrailDecision(tool_name=tool_name, signature=signature) exact_count = self._exact_failure_counts.get(signature, 0) if exact_count >= self.config.exact_failure_block_after: @@ -253,7 +306,7 @@ class ToolCallGuardrailController: same_count = self._same_tool_failure_counts.get(tool_name, 0) + 1 self._same_tool_failure_counts[tool_name] = same_count - if same_count >= self.config.same_tool_failure_halt_after: + if self.config.hard_stop_enabled and same_count >= self.config.same_tool_failure_halt_after: decision = ToolGuardrailDecision( action="halt", code="same_tool_failure_halt", @@ -268,27 +321,27 @@ class ToolCallGuardrailController: self._halt_decision = decision return decision - if exact_count >= self.config.exact_failure_warn_after: + if self.config.warnings_enabled and exact_count >= self.config.exact_failure_warn_after: return ToolGuardrailDecision( action="warn", code="repeated_exact_failure_warning", message=( - f"Tool guardrail: {tool_name} has failed {exact_count} times " - "with identical arguments. Do not retry it unchanged; inspect the " - "error and change strategy." + f"{tool_name} has failed {exact_count} times with identical arguments. " + "This looks like a loop; inspect the error and change strategy " + "instead of retrying it unchanged." ), tool_name=tool_name, count=exact_count, signature=signature, ) - if same_count >= self.config.same_tool_failure_warn_after: + if self.config.warnings_enabled and same_count >= self.config.same_tool_failure_warn_after: return ToolGuardrailDecision( action="warn", code="same_tool_failure_warning", message=( - f"Tool guardrail: {tool_name} has failed {same_count} times " - "this turn. Change approach before retrying." + f"{tool_name} has failed {same_count} times this turn. " + "This looks like a loop; change approach before retrying." ), tool_name=tool_name, count=same_count, @@ -311,14 +364,14 @@ class ToolCallGuardrailController: repeat_count = previous[1] + 1 self._no_progress[signature] = (result_hash, repeat_count) - if repeat_count >= self.config.no_progress_warn_after: + if self.config.warnings_enabled and repeat_count >= self.config.no_progress_warn_after: return ToolGuardrailDecision( action="warn", code="idempotent_no_progress_warning", message=( - f"Tool guardrail: {tool_name} returned the same result " - f"{repeat_count} times. Use the result or change the query instead " - "of repeating it unchanged." + f"{tool_name} returned the same result {repeat_count} times. " + "Use the result already provided or change the query instead of " + "repeating it unchanged." ), tool_name=tool_name, count=repeat_count, @@ -348,8 +401,9 @@ def append_toolguard_guidance(result: str, decision: ToolGuardrailDecision) -> s """Append runtime guidance to the current tool result content.""" if decision.action not in {"warn", "halt"} or not decision.message: return result + label = "Tool loop hard stop" if decision.action == "halt" else "Tool loop warning" suffix = ( - "\n\n[Tool guardrail: " + f"\n\n[{label}: " f"{decision.code}; count={decision.count}; {decision.message}]" ) return (result or "") + suffix @@ -377,5 +431,31 @@ def _result_hash(result: str | None) -> str: return _sha256(canonical) +def _as_bool(value: Any, default: bool) -> bool: + if value is None: + return default + if isinstance(value, bool): + return value + if isinstance(value, (int, float)): + return bool(value) + if isinstance(value, str): + lowered = value.strip().lower() + if lowered in {"1", "true", "yes", "on", "enabled"}: + return True + if lowered in {"0", "false", "no", "off", "disabled"}: + return False + return default + + +def _positive_int(value: Any, default: int) -> int: + if value is None: + return default + try: + parsed = int(value) + except (TypeError, ValueError): + return default + return parsed if parsed >= 1 else default + + def _sha256(value: str) -> str: return hashlib.sha256(value.encode("utf-8")).hexdigest() diff --git a/cli-config.yaml.example b/cli-config.yaml.example index e292498b0c..c92be7e26b 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -289,6 +289,25 @@ browser: # after this period of no activity between agent loops (default: 120 = 2 minutes) inactivity_timeout: 120 +# ============================================================================= +# Tool Loop Guardrails +# ============================================================================= +# Soft warnings are enabled by default. They append guidance to repeated failed +# or non-progressing tool results but still let the tool execute. Hard stops are +# opt-in circuit breakers for autonomous/cron sessions where stopping a loop is +# preferable to spending the full iteration budget. +tool_loop_guardrails: + warnings_enabled: true + hard_stop_enabled: false + warn_after: + exact_failure: 2 + same_tool_failure: 3 + idempotent_no_progress: 2 + hard_stop_after: + exact_failure: 5 + same_tool_failure: 8 + idempotent_no_progress: 5 + # ============================================================================= # Context Compression (Auto-shrinks long conversations) # ============================================================================= diff --git a/hermes_cli/config.py b/hermes_cli/config.py index e765448b7b..d392467676 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -607,6 +607,24 @@ DEFAULT_CONFIG = { "max_line_length": 2000, }, + # Tool loop guardrails nudge models when they repeat failed or + # non-progressing tool calls. Soft warnings are always-on by default; + # hard stops are opt-in so interactive CLI/TUI sessions keep flowing. + "tool_loop_guardrails": { + "warnings_enabled": True, + "hard_stop_enabled": False, + "warn_after": { + "exact_failure": 2, + "same_tool_failure": 3, + "idempotent_no_progress": 2, + }, + "hard_stop_after": { + "exact_failure": 5, + "same_tool_failure": 8, + "idempotent_no_progress": 5, + }, + }, + "compression": { "enabled": True, "threshold": 0.50, # compress when context usage exceeds this ratio diff --git a/run_agent.py b/run_agent.py index 20b396f01e..0fe6e4a826 100644 --- a/run_agent.py +++ b/run_agent.py @@ -163,6 +163,7 @@ from agent.display import ( get_tool_emoji as _get_tool_emoji, ) from agent.tool_guardrails import ( + ToolCallGuardrailConfig, ToolCallGuardrailController, ToolGuardrailDecision, append_toolguard_guidance, @@ -1666,6 +1667,14 @@ class AIAgent: _agent_cfg = _load_agent_config() except Exception: _agent_cfg = {} + try: + self._tool_guardrails = ToolCallGuardrailController( + ToolCallGuardrailConfig.from_mapping( + _agent_cfg.get("tool_loop_guardrails", {}) + ) + ) + except Exception as _tlg_err: + logger.warning("Tool loop guardrail config ignored: %s", _tlg_err) # Cache only the derived auxiliary compression context override that is # needed later by the startup feasibility check. Avoid exposing a # broad pseudo-public config object on the agent instance. diff --git a/tests/agent/test_tool_guardrails.py b/tests/agent/test_tool_guardrails.py index 18999b2f39..c50be56f43 100644 --- a/tests/agent/test_tool_guardrails.py +++ b/tests/agent/test_tool_guardrails.py @@ -32,9 +32,69 @@ def test_tool_call_signature_hashes_canonical_nested_unicode_args_without_exposi assert "☤" not in json.dumps(metadata) -def test_repeated_identical_failed_call_warns_then_blocks_before_third_execution(): +def test_default_config_is_soft_warning_only_with_hard_stop_disabled(): + cfg = ToolCallGuardrailConfig() + + assert cfg.warnings_enabled is True + assert cfg.hard_stop_enabled is False + assert cfg.exact_failure_warn_after == 2 + assert cfg.same_tool_failure_warn_after == 3 + assert cfg.no_progress_warn_after == 2 + assert cfg.exact_failure_block_after == 5 + assert cfg.same_tool_failure_halt_after == 8 + assert cfg.no_progress_block_after == 5 + + +def test_config_parses_nested_warn_and_hard_stop_thresholds(): + cfg = ToolCallGuardrailConfig.from_mapping( + { + "warnings_enabled": False, + "hard_stop_enabled": True, + "warn_after": { + "exact_failure": 3, + "same_tool_failure": 4, + "idempotent_no_progress": 5, + }, + "hard_stop_after": { + "exact_failure": 6, + "same_tool_failure": 7, + "idempotent_no_progress": 8, + }, + } + ) + + assert cfg.warnings_enabled is False + assert cfg.hard_stop_enabled is True + assert cfg.exact_failure_warn_after == 3 + assert cfg.same_tool_failure_warn_after == 4 + assert cfg.no_progress_warn_after == 5 + assert cfg.exact_failure_block_after == 6 + assert cfg.same_tool_failure_halt_after == 7 + assert cfg.no_progress_block_after == 8 + + +def test_default_repeated_identical_failed_call_warns_without_blocking(): + controller = ToolCallGuardrailController() + args = {"query": "same"} + + decisions = [] + for _ in range(5): + assert controller.before_call("web_search", args).action == "allow" + decisions.append( + controller.after_call("web_search", args, '{"error":"boom"}', failed=True) + ) + + assert decisions[0].action == "allow" + assert [d.action for d in decisions[1:]] == ["warn", "warn", "warn", "warn"] + assert {d.code for d in decisions[1:]} == {"repeated_exact_failure_warning"} + assert controller.before_call("web_search", args).action == "allow" + assert controller.halt_decision is None + + +def test_hard_stop_enabled_blocks_repeated_exact_failure_before_next_execution(): controller = ToolCallGuardrailController( ToolCallGuardrailConfig( + hard_stop_enabled=True, exact_failure_warn_after=2, exact_failure_block_after=2, same_tool_failure_halt_after=99, @@ -50,18 +110,16 @@ def test_repeated_identical_failed_call_warns_then_blocks_before_third_execution second = controller.after_call("web_search", args, '{"error":"boom"}', failed=True) assert second.action == "warn" assert second.code == "repeated_exact_failure_warning" - assert second.count == 2 blocked = controller.before_call("web_search", args) assert blocked.action == "block" assert blocked.code == "repeated_exact_failure_block" - assert blocked.tool_name == "web_search" assert blocked.count == 2 def test_success_resets_exact_signature_failure_streak(): controller = ToolCallGuardrailController( - ToolCallGuardrailConfig(exact_failure_block_after=2, same_tool_failure_halt_after=99) + ToolCallGuardrailConfig(hard_stop_enabled=True, exact_failure_block_after=2, same_tool_failure_halt_after=99) ) args = {"query": "same"} @@ -73,9 +131,26 @@ def test_success_resets_exact_signature_failure_streak(): assert controller.before_call("web_search", args).action == "allow" -def test_same_tool_varying_args_failure_streak_warns_then_halts_independent_of_exact_streak(): +def test_same_tool_varying_args_warns_by_default_without_halting(): + controller = ToolCallGuardrailController( + ToolCallGuardrailConfig(same_tool_failure_warn_after=2, same_tool_failure_halt_after=3) + ) + + first = controller.after_call("terminal", {"command": "cmd-1"}, '{"exit_code":1}', failed=True) + second = controller.after_call("terminal", {"command": "cmd-2"}, '{"exit_code":1}', failed=True) + third = controller.after_call("terminal", {"command": "cmd-3"}, '{"exit_code":1}', failed=True) + fourth = controller.after_call("terminal", {"command": "cmd-4"}, '{"exit_code":1}', failed=True) + + assert first.action == "allow" + assert [second.action, third.action, fourth.action] == ["warn", "warn", "warn"] + assert {second.code, third.code, fourth.code} == {"same_tool_failure_warning"} + assert controller.halt_decision is None + + +def test_hard_stop_enabled_halts_same_tool_varying_args_failure_streak(): controller = ToolCallGuardrailController( ToolCallGuardrailConfig( + hard_stop_enabled=True, exact_failure_block_after=99, same_tool_failure_warn_after=2, same_tool_failure_halt_after=3, @@ -93,13 +168,34 @@ def test_same_tool_varying_args_failure_streak_warns_then_halts_independent_of_e assert third.count == 3 -def test_idempotent_no_progress_repeated_result_warns_then_blocks_future_repeat(): +def test_idempotent_no_progress_repeated_result_warns_without_blocking_by_default(): controller = ToolCallGuardrailController( ToolCallGuardrailConfig(no_progress_warn_after=2, no_progress_block_after=2) ) args = {"path": "/tmp/same.txt"} result = "same file contents" + for _ in range(4): + assert controller.before_call("read_file", args).action == "allow" + decision = controller.after_call("read_file", args, result, failed=False) + + assert decision.action == "warn" + assert decision.code == "idempotent_no_progress_warning" + assert controller.before_call("read_file", args).action == "allow" + assert controller.halt_decision is None + + +def test_hard_stop_enabled_blocks_idempotent_no_progress_future_repeat(): + controller = ToolCallGuardrailController( + ToolCallGuardrailConfig( + hard_stop_enabled=True, + no_progress_warn_after=2, + no_progress_block_after=2, + ) + ) + args = {"path": "/tmp/same.txt"} + result = "same file contents" + assert controller.before_call("read_file", args).action == "allow" assert controller.after_call("read_file", args, result, failed=False).action == "allow" assert controller.before_call("read_file", args).action == "allow" @@ -126,7 +222,7 @@ def test_mutating_or_unknown_tools_are_not_blocked_for_repeated_identical_succes def test_reset_for_turn_clears_bounded_guardrail_state(): controller = ToolCallGuardrailController( - ToolCallGuardrailConfig(exact_failure_block_after=2, no_progress_block_after=2) + ToolCallGuardrailConfig(hard_stop_enabled=True, exact_failure_block_after=2, no_progress_block_after=2) ) controller.after_call("web_search", {"query": "same"}, '{"error":"boom"}', failed=True) controller.after_call("web_search", {"query": "same"}, '{"error":"boom"}', failed=True) diff --git a/tests/run_agent/test_tool_call_guardrail_runtime.py b/tests/run_agent/test_tool_call_guardrail_runtime.py index 1b138b02e1..3b15f4f1cc 100644 --- a/tests/run_agent/test_tool_call_guardrail_runtime.py +++ b/tests/run_agent/test_tool_call_guardrail_runtime.py @@ -36,10 +36,11 @@ def _mock_response(content="Hello", finish_reason="stop", tool_calls=None): return SimpleNamespace(choices=[choice], model="test/model", usage=None) -def _make_agent(*tool_names: str, max_iterations: int = 10) -> AIAgent: +def _make_agent(*tool_names: str, max_iterations: int = 10, config: dict | None = None) -> AIAgent: with ( patch("run_agent.get_tool_definitions", return_value=_make_tool_defs(*tool_names)), patch("run_agent.check_toolset_requirements", return_value={}), + patch("hermes_cli.config.load_config", return_value=config or {}), patch("run_agent.OpenAI"), ): agent = AIAgent( @@ -69,7 +70,23 @@ def _seed_exact_failures(agent: AIAgent, tool_name: str, args: dict, count: int ) -def test_sequential_path_blocks_repeated_exact_failure_before_execution(): +def _hard_stop_config(**overrides) -> dict: + cfg = { + "tool_loop_guardrails": { + "warnings_enabled": True, + "hard_stop_enabled": True, + "hard_stop_after": { + "exact_failure": 2, + "same_tool_failure": 8, + "idempotent_no_progress": 5, + }, + } + } + cfg["tool_loop_guardrails"].update(overrides) + return cfg + + +def test_default_sequential_path_warns_repeated_exact_failure_without_blocking_execution(): agent = _make_agent("web_search") args = {"query": "same"} _seed_exact_failures(agent, "web_search", args) @@ -77,6 +94,32 @@ def test_sequential_path_blocks_repeated_exact_failure_before_execution(): progress = [] agent.tool_start_callback = lambda *a, **k: starts.append((a, k)) agent.tool_progress_callback = lambda *a, **k: progress.append((a, k)) + tc = _mock_tool_call("web_search", json.dumps(args), "c-soft") + msg = SimpleNamespace(content="", tool_calls=[tc]) + messages = [] + + with patch("run_agent.handle_function_call", return_value=json.dumps({"error": "boom"})) as mock_hfc: + agent._execute_tool_calls_sequential(msg, messages, "task-1") + + mock_hfc.assert_called_once() + assert len(starts) == 1 + assert any(event[0][0] == "tool.completed" for event in progress) + assert len(messages) == 1 + assert messages[0]["role"] == "tool" + assert messages[0]["tool_call_id"] == "c-soft" + assert "repeated_exact_failure_warning" in messages[0]["content"] + assert "repeated_exact_failure_block" not in messages[0]["content"] + assert agent._tool_guardrail_halt_decision is None + + +def test_config_enabled_hard_stop_blocks_repeated_exact_failure_before_execution(): + agent = _make_agent("web_search", config=_hard_stop_config()) + args = {"query": "same"} + _seed_exact_failures(agent, "web_search", args) + starts = [] + progress = [] + agent.tool_start_callback = lambda *a, **k: starts.append((a, k)) + agent.tool_progress_callback = lambda *a, **k: progress.append((a, k)) tc = _mock_tool_call("web_search", json.dumps(args), "c-block") msg = SimpleNamespace(content="", tool_calls=[tc]) messages = [] @@ -106,12 +149,12 @@ def test_sequential_after_call_appends_guidance_to_tool_result_without_extra_mes assert [m["role"] for m in messages] == ["tool"] assert messages[0]["tool_call_id"] == "c-warn" - assert "Tool guardrail" in messages[0]["content"] + assert "Tool loop warning" in messages[0]["content"] assert "repeated_exact_failure_warning" in messages[0]["content"] -def test_concurrent_path_does_not_submit_blocked_calls_and_preserves_result_order(): - agent = _make_agent("web_search") +def test_config_enabled_hard_stop_concurrent_path_does_not_submit_blocked_calls_and_preserves_result_order(): + agent = _make_agent("web_search", config=_hard_stop_config()) blocked_args = {"query": "blocked"} allowed_args = {"query": "allowed"} _seed_exact_failures(agent, "web_search", blocked_args) @@ -164,9 +207,39 @@ def test_plugin_pre_tool_block_wins_without_counting_as_toolguard_block(): assert agent._tool_guardrails.before_call("web_search", args).action == "allow" -def test_run_conversation_returns_controlled_guardrail_halt_without_top_level_error(): +def test_default_run_conversation_warns_without_guardrail_halt(): agent = _make_agent("web_search", max_iterations=10) same_args = {"query": "same"} + responses = [ + _mock_response( + content="", + finish_reason="tool_calls", + tool_calls=[_mock_tool_call("web_search", json.dumps(same_args), f"c{i}")], + ) + for i in range(1, 4) + ] + responses.append(_mock_response(content="done", finish_reason="stop", tool_calls=None)) + agent.client.chat.completions.create.side_effect = responses + + with ( + patch("run_agent.handle_function_call", return_value=json.dumps({"error": "boom"})) as mock_hfc, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + result = agent.run_conversation("search repeatedly") + + assert mock_hfc.call_count == 3 + assert result["turn_exit_reason"].startswith("text_response") + assert "guardrail" not in result + assert result["final_response"] == "done" + tool_contents = [m["content"] for m in result["messages"] if m.get("role") == "tool"] + assert any("repeated_exact_failure_warning" in content for content in tool_contents) + + +def test_config_enabled_hard_stop_run_conversation_returns_controlled_guardrail_halt_without_top_level_error(): + agent = _make_agent("web_search", max_iterations=10, config=_hard_stop_config()) + same_args = {"query": "same"} responses = [ _mock_response( content="",